You can subscribe to this list here.
2006 |
Jan
|
Feb
|
Mar
|
Apr
|
May
|
Jun
|
Jul
|
Aug
|
Sep
|
Oct
(33) |
Nov
(325) |
Dec
(320) |
---|---|---|---|---|---|---|---|---|---|---|---|---|
2007 |
Jan
(484) |
Feb
(438) |
Mar
(407) |
Apr
(713) |
May
(831) |
Jun
(806) |
Jul
(1023) |
Aug
(1184) |
Sep
(1118) |
Oct
(1461) |
Nov
(1224) |
Dec
(1042) |
2008 |
Jan
(1449) |
Feb
(1110) |
Mar
(1428) |
Apr
(1643) |
May
(682) |
Jun
|
Jul
|
Aug
|
Sep
|
Oct
|
Nov
|
Dec
|
From: Kay, A. M <all...@in...> - 2008-05-05 21:37:40
|
Intel-iommu driver changes for kvm vt-d support. Important changes are in intel-iommu.c. The rest of the changes are for moving intel-iommu.h and iova.h from drivers/pci directory to include/linux directory. Signed-off-by: Allen M Kay <all...@in...> ---- b/drivers/pci/dmar.c | 4 b/drivers/pci/intel-iommu.c | 26 ++- b/drivers/pci/iova.c | 2 b/include/linux/intel-iommu.h | 344 ++++++++++++++++++++++++++++++++++++++++++ b/include/linux/iova.h | 52 ++++++ drivers/pci/intel-iommu.h | 344 ------------------------------------------ drivers/pci/iova.h | 52 ------ 7 files changed, 416 insertions(+), 408 deletions(-) ---- diff --git a/drivers/pci/dmar.c b/drivers/pci/dmar.c index f941f60..a58a5b0 100644 --- a/drivers/pci/dmar.c +++ b/drivers/pci/dmar.c @@ -26,8 +26,8 @@ #include <linux/pci.h> #include <linux/dmar.h> -#include "iova.h" -#include "intel-iommu.h" +#include <linux/iova.h> +#include <linux/intel-iommu.h> #undef PREFIX #define PREFIX "DMAR:" diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index 4cb949f..bfa888b 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -31,8 +31,8 @@ #include <linux/dmar.h> #include <linux/dma-mapping.h> #include <linux/mempool.h> -#include "iova.h" -#include "intel-iommu.h" +#include <linux/iova.h> +#include <linux/intel-iommu.h> #include <asm/proto.h> /* force_iommu in this header in x86-64*/ #include <asm/cacheflush.h> #include <asm/gart.h> @@ -1056,7 +1056,7 @@ static void free_iommu(struct intel_iommu *iommu) kfree(iommu); } -static struct dmar_domain * iommu_alloc_domain(struct intel_iommu *iommu) +struct dmar_domain * iommu_alloc_domain(struct intel_iommu *iommu) { unsigned long num; unsigned long ndomains; @@ -1086,8 +1086,9 @@ static struct dmar_domain * iommu_alloc_domain(struct intel_iommu *iommu) return domain; } +EXPORT_SYMBOL_GPL(iommu_alloc_domain); -static void iommu_free_domain(struct dmar_domain *domain) +void iommu_free_domain(struct dmar_domain *domain) { unsigned long flags; @@ -1095,6 +1096,7 @@ static void iommu_free_domain(struct dmar_domain *domain) clear_bit(domain->id, domain->iommu->domain_ids); spin_unlock_irqrestore(&domain->iommu->lock, flags); } +EXPORT_SYMBOL_GPL(iommu_free_domain); static struct iova_domain reserved_iova_list; static struct lock_class_key reserved_alloc_key; @@ -1160,7 +1162,7 @@ static inline int guestwidth_to_adjustwidth(int gaw) return agaw; } -static int domain_init(struct dmar_domain *domain, int guest_width) +int domain_init(struct dmar_domain *domain, int guest_width) { struct intel_iommu *iommu; int adjust_width, agaw; @@ -1196,6 +1198,7 @@ static int domain_init(struct dmar_domain *domain, int guest_width) __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE_4K); return 0; } +EXPORT_SYMBOL_GPL(domain_init); static void domain_exit(struct dmar_domain *domain) { @@ -1258,7 +1261,7 @@ static int domain_context_mapping_one(struct dmar_domain *domain, return 0; } -static int +int domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev) { int ret; @@ -1289,6 +1292,7 @@ domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev) return domain_context_mapping_one(domain, tmp->bus->number, tmp->devfn); } +EXPORT_SYMBOL_GPL(domain_context_mapping); static int domain_context_mapped(struct dmar_domain *domain, struct pci_dev *pdev) @@ -1321,7 +1325,7 @@ static int domain_context_mapped(struct dmar_domain *domain, tmp->bus->number, tmp->devfn); } -static int +int domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova, u64 hpa, size_t size, 
int prot) { @@ -1351,13 +1355,15 @@ domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova, } return 0; } +EXPORT_SYMBOL_GPL(domain_page_mapping); -static void detach_domain_for_dev(struct dmar_domain *domain, u8 bus, u8 devfn) +void detach_domain_for_dev(struct dmar_domain *domain, u8 bus, u8 devfn) { clear_context_table(domain->iommu, bus, devfn); iommu_flush_context_global(domain->iommu, 0); iommu_flush_iotlb_global(domain->iommu, 0); } +EXPORT_SYMBOL_GPL(detach_domain_for_dev); static void domain_remove_dev_info(struct dmar_domain *domain) { @@ -1397,6 +1403,7 @@ find_domain(struct pci_dev *pdev) return info->domain; return NULL; } +EXPORT_SYMBOL_GPL(find_domain); static int dmar_pci_device_match(struct pci_dev *devices[], int cnt, struct pci_dev *dev) @@ -1415,7 +1422,7 @@ static int dmar_pci_device_match(struct pci_dev *devices[], int cnt, return 0; } -static struct dmar_drhd_unit * +struct dmar_drhd_unit * dmar_find_matched_drhd_unit(struct pci_dev *dev) { struct dmar_drhd_unit *drhd = NULL; @@ -1428,6 +1435,7 @@ dmar_find_matched_drhd_unit(struct pci_dev *dev) return NULL; } +EXPORT_SYMBOL_GPL(dmar_find_matched_drhd_unit); /* domain is initialized */ static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw) diff --git a/drivers/pci/intel-iommu.h b/drivers/pci/intel-iommu.h deleted file mode 100644 index afc0ad9..0000000 --- a/drivers/pci/intel-iommu.h +++ /dev/null @@ -1,344 +0,0 @@ -/* - * Copyright (c) 2006, Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., 59 Temple - * Place - Suite 330, Boston, MA 02111-1307 USA. - * - * Copyright (C) 2006-2008 Intel Corporation - * Author: Ashok Raj <ash...@in...> - * Author: Anil S Keshavamurthy <ani...@in...> - */ - -#ifndef _INTEL_IOMMU_H_ -#define _INTEL_IOMMU_H_ - -#include <linux/types.h> -#include <linux/msi.h> -#include <linux/sysdev.h> -#include "iova.h" -#include <linux/io.h> - -/* - * We need a fixed PAGE_SIZE of 4K irrespective of - * arch PAGE_SIZE for IOMMU page tables. - */ -#define PAGE_SHIFT_4K (12) -#define PAGE_SIZE_4K (1UL << PAGE_SHIFT_4K) -#define PAGE_MASK_4K (((u64)-1) << PAGE_SHIFT_4K) -#define PAGE_ALIGN_4K(addr) (((addr) + PAGE_SIZE_4K - 1) & PAGE_MASK_4K) - -#define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT_4K) -#define DMA_32BIT_PFN IOVA_PFN(DMA_32BIT_MASK) -#define DMA_64BIT_PFN IOVA_PFN(DMA_64BIT_MASK) - -/* - * Intel IOMMU register specification per version 1.0 public spec. 
- */ - -#define DMAR_VER_REG 0x0 /* Arch version supported by this IOMMU */ -#define DMAR_CAP_REG 0x8 /* Hardware supported capabilities */ -#define DMAR_ECAP_REG 0x10 /* Extended capabilities supported */ -#define DMAR_GCMD_REG 0x18 /* Global command register */ -#define DMAR_GSTS_REG 0x1c /* Global status register */ -#define DMAR_RTADDR_REG 0x20 /* Root entry table */ -#define DMAR_CCMD_REG 0x28 /* Context command reg */ -#define DMAR_FSTS_REG 0x34 /* Fault Status register */ -#define DMAR_FECTL_REG 0x38 /* Fault control register */ -#define DMAR_FEDATA_REG 0x3c /* Fault event interrupt data register */ -#define DMAR_FEADDR_REG 0x40 /* Fault event interrupt addr register */ -#define DMAR_FEUADDR_REG 0x44 /* Upper address register */ -#define DMAR_AFLOG_REG 0x58 /* Advanced Fault control */ -#define DMAR_PMEN_REG 0x64 /* Enable Protected Memory Region */ -#define DMAR_PLMBASE_REG 0x68 /* PMRR Low addr */ -#define DMAR_PLMLIMIT_REG 0x6c /* PMRR low limit */ -#define DMAR_PHMBASE_REG 0x70 /* pmrr high base addr */ -#define DMAR_PHMLIMIT_REG 0x78 /* pmrr high limit */ - -#define OFFSET_STRIDE (9) -/* -#define dmar_readl(dmar, reg) readl(dmar + reg) -#define dmar_readq(dmar, reg) ({ \ - u32 lo, hi; \ - lo = readl(dmar + reg); \ - hi = readl(dmar + reg + 4); \ - (((u64) hi) << 32) + lo; }) -*/ -static inline u64 dmar_readq(void __iomem *addr) -{ - u32 lo, hi; - lo = readl(addr); - hi = readl(addr + 4); - return (((u64) hi) << 32) + lo; -} - -static inline void dmar_writeq(void __iomem *addr, u64 val) -{ - writel((u32)val, addr); - writel((u32)(val >> 32), addr + 4); -} - -#define DMAR_VER_MAJOR(v) (((v) & 0xf0) >> 4) -#define DMAR_VER_MINOR(v) ((v) & 0x0f) - -/* - * Decoding Capability Register - */ -#define cap_read_drain(c) (((c) >> 55) & 1) -#define cap_write_drain(c) (((c) >> 54) & 1) -#define cap_max_amask_val(c) (((c) >> 48) & 0x3f) -#define cap_num_fault_regs(c) ((((c) >> 40) & 0xff) + 1) -#define cap_pgsel_inv(c) (((c) >> 39) & 1) - -#define cap_super_page_val(c) (((c) >> 34) & 0xf) -#define cap_super_offset(c) (((find_first_bit(&cap_super_page_val(c), 4)) \ - * OFFSET_STRIDE) + 21) - -#define cap_fault_reg_offset(c) ((((c) >> 24) & 0x3ff) * 16) -#define cap_max_fault_reg_offset(c) \ - (cap_fault_reg_offset(c) + cap_num_fault_regs(c) * 16) - -#define cap_zlr(c) (((c) >> 22) & 1) -#define cap_isoch(c) (((c) >> 23) & 1) -#define cap_mgaw(c) ((((c) >> 16) & 0x3f) + 1) -#define cap_sagaw(c) (((c) >> 8) & 0x1f) -#define cap_caching_mode(c) (((c) >> 7) & 1) -#define cap_phmr(c) (((c) >> 6) & 1) -#define cap_plmr(c) (((c) >> 5) & 1) -#define cap_rwbf(c) (((c) >> 4) & 1) -#define cap_afl(c) (((c) >> 3) & 1) -#define cap_ndoms(c) (((unsigned long)1) << (4 + 2 * ((c) & 0x7))) -/* - * Extended Capability Register - */ - -#define ecap_niotlb_iunits(e) ((((e) >> 24) & 0xff) + 1) -#define ecap_iotlb_offset(e) ((((e) >> 8) & 0x3ff) * 16) -#define ecap_max_iotlb_offset(e) \ - (ecap_iotlb_offset(e) + ecap_niotlb_iunits(e) * 16) -#define ecap_coherent(e) ((e) & 0x1) - - -/* IOTLB_REG */ -#define DMA_TLB_GLOBAL_FLUSH (((u64)1) << 60) -#define DMA_TLB_DSI_FLUSH (((u64)2) << 60) -#define DMA_TLB_PSI_FLUSH (((u64)3) << 60) -#define DMA_TLB_IIRG(type) ((type >> 60) & 7) -#define DMA_TLB_IAIG(val) (((val) >> 57) & 7) -#define DMA_TLB_READ_DRAIN (((u64)1) << 49) -#define DMA_TLB_WRITE_DRAIN (((u64)1) << 48) -#define DMA_TLB_DID(id) (((u64)((id) & 0xffff)) << 32) -#define DMA_TLB_IVT (((u64)1) << 63) -#define DMA_TLB_IH_NONLEAF (((u64)1) << 6) -#define DMA_TLB_MAX_SIZE (0x3f) - -/* PMEN_REG */ -#define 
DMA_PMEN_EPM (((u32)1)<<31) -#define DMA_PMEN_PRS (((u32)1)<<0) - -/* GCMD_REG */ -#define DMA_GCMD_TE (((u32)1) << 31) -#define DMA_GCMD_SRTP (((u32)1) << 30) -#define DMA_GCMD_SFL (((u32)1) << 29) -#define DMA_GCMD_EAFL (((u32)1) << 28) -#define DMA_GCMD_WBF (((u32)1) << 27) - -/* GSTS_REG */ -#define DMA_GSTS_TES (((u32)1) << 31) -#define DMA_GSTS_RTPS (((u32)1) << 30) -#define DMA_GSTS_FLS (((u32)1) << 29) -#define DMA_GSTS_AFLS (((u32)1) << 28) -#define DMA_GSTS_WBFS (((u32)1) << 27) - -/* CCMD_REG */ -#define DMA_CCMD_ICC (((u64)1) << 63) -#define DMA_CCMD_GLOBAL_INVL (((u64)1) << 61) -#define DMA_CCMD_DOMAIN_INVL (((u64)2) << 61) -#define DMA_CCMD_DEVICE_INVL (((u64)3) << 61) -#define DMA_CCMD_FM(m) (((u64)((m) & 0x3)) << 32) -#define DMA_CCMD_MASK_NOBIT 0 -#define DMA_CCMD_MASK_1BIT 1 -#define DMA_CCMD_MASK_2BIT 2 -#define DMA_CCMD_MASK_3BIT 3 -#define DMA_CCMD_SID(s) (((u64)((s) & 0xffff)) << 16) -#define DMA_CCMD_DID(d) ((u64)((d) & 0xffff)) - -/* FECTL_REG */ -#define DMA_FECTL_IM (((u32)1) << 31) - -/* FSTS_REG */ -#define DMA_FSTS_PPF ((u32)2) -#define DMA_FSTS_PFO ((u32)1) -#define dma_fsts_fault_record_index(s) (((s) >> 8) & 0xff) - -/* FRCD_REG, 32 bits access */ -#define DMA_FRCD_F (((u32)1) << 31) -#define dma_frcd_type(d) ((d >> 30) & 1) -#define dma_frcd_fault_reason(c) (c & 0xff) -#define dma_frcd_source_id(c) (c & 0xffff) -#define dma_frcd_page_addr(d) (d & (((u64)-1) << 12)) /* low 64 bit */ - -/* - * 0: Present - * 1-11: Reserved - * 12-63: Context Ptr (12 - (haw-1)) - * 64-127: Reserved - */ -struct root_entry { - u64 val; - u64 rsvd1; -}; -#define ROOT_ENTRY_NR (PAGE_SIZE_4K/sizeof(struct root_entry)) -static inline bool root_present(struct root_entry *root) -{ - return (root->val & 1); -} -static inline void set_root_present(struct root_entry *root) -{ - root->val |= 1; -} -static inline void set_root_value(struct root_entry *root, unsigned long value) -{ - root->val |= value & PAGE_MASK_4K; -} - -struct context_entry; -static inline struct context_entry * -get_context_addr_from_root(struct root_entry *root) -{ - return (struct context_entry *) - (root_present(root)?phys_to_virt( - root->val & PAGE_MASK_4K): - NULL); -} - -/* - * low 64 bits: - * 0: present - * 1: fault processing disable - * 2-3: translation type - * 12-63: address space root - * high 64 bits: - * 0-2: address width - * 3-6: aval - * 8-23: domain id - */ -struct context_entry { - u64 lo; - u64 hi; -}; -#define context_present(c) ((c).lo & 1) -#define context_fault_disable(c) (((c).lo >> 1) & 1) -#define context_translation_type(c) (((c).lo >> 2) & 3) -#define context_address_root(c) ((c).lo & PAGE_MASK_4K) -#define context_address_width(c) ((c).hi & 7) -#define context_domain_id(c) (((c).hi >> 8) & ((1 << 16) - 1)) - -#define context_set_present(c) do {(c).lo |= 1;} while (0) -#define context_set_fault_enable(c) \ - do {(c).lo &= (((u64)-1) << 2) | 1;} while (0) -#define context_set_translation_type(c, val) \ - do { \ - (c).lo &= (((u64)-1) << 4) | 3; \ - (c).lo |= ((val) & 3) << 2; \ - } while (0) -#define CONTEXT_TT_MULTI_LEVEL 0 -#define context_set_address_root(c, val) \ - do {(c).lo |= (val) & PAGE_MASK_4K;} while (0) -#define context_set_address_width(c, val) do {(c).hi |= (val) & 7;} while (0) -#define context_set_domain_id(c, val) \ - do {(c).hi |= ((val) & ((1 << 16) - 1)) << 8;} while (0) -#define context_clear_entry(c) do {(c).lo = 0; (c).hi = 0;} while (0) - -/* - * 0: readable - * 1: writable - * 2-6: reserved - * 7: super page - * 8-11: available - * 12-63: Host physcial address - 
*/ -struct dma_pte { - u64 val; -}; -#define dma_clear_pte(p) do {(p).val = 0;} while (0) - -#define DMA_PTE_READ (1) -#define DMA_PTE_WRITE (2) - -#define dma_set_pte_readable(p) do {(p).val |= DMA_PTE_READ;} while (0) -#define dma_set_pte_writable(p) do {(p).val |= DMA_PTE_WRITE;} while (0) -#define dma_set_pte_prot(p, prot) \ - do {(p).val = ((p).val & ~3) | ((prot) & 3); } while (0) -#define dma_pte_addr(p) ((p).val & PAGE_MASK_4K) -#define dma_set_pte_addr(p, addr) do {\ - (p).val |= ((addr) & PAGE_MASK_4K); } while (0) -#define dma_pte_present(p) (((p).val & 3) != 0) - -struct intel_iommu; - -struct dmar_domain { - int id; /* domain id */ - struct intel_iommu *iommu; /* back pointer to owning iommu */ - - struct list_head devices; /* all devices' list */ - struct iova_domain iovad; /* iova's that belong to this domain */ - - struct dma_pte *pgd; /* virtual address */ - spinlock_t mapping_lock; /* page table lock */ - int gaw; /* max guest address width */ - - /* adjusted guest address width, 0 is level 2 30-bit */ - int agaw; - -#define DOMAIN_FLAG_MULTIPLE_DEVICES 1 - int flags; -}; - -/* PCI domain-device relationship */ -struct device_domain_info { - struct list_head link; /* link to domain siblings */ - struct list_head global; /* link to global list */ - u8 bus; /* PCI bus numer */ - u8 devfn; /* PCI devfn number */ - struct pci_dev *dev; /* it's NULL for PCIE-to-PCI bridge */ - struct dmar_domain *domain; /* pointer to domain */ -}; - -extern int init_dmars(void); - -struct intel_iommu { - void __iomem *reg; /* Pointer to hardware regs, virtual addr */ - u64 cap; - u64 ecap; - unsigned long *domain_ids; /* bitmap of domains */ - struct dmar_domain **domains; /* ptr to domains */ - int seg; - u32 gcmd; /* Holds TE, EAFL. Don't need SRTP, SFL, WBF */ - spinlock_t lock; /* protect context, domain ids */ - spinlock_t register_lock; /* protect register handling */ - struct root_entry *root_entry; /* virtual address */ - - unsigned int irq; - unsigned char name[7]; /* Device Name */ - struct msi_msg saved_msg; - struct sys_device sysdev; -}; - -#ifndef CONFIG_DMAR_GFX_WA -static inline void iommu_prepare_gfx_mapping(void) -{ - return; -} -#endif /* !CONFIG_DMAR_GFX_WA */ - -#endif diff --git a/drivers/pci/iova.c b/drivers/pci/iova.c index dbcdd6b..0dfac4b 100644 --- a/drivers/pci/iova.c +++ b/drivers/pci/iova.c @@ -7,7 +7,7 @@ * Author: Anil S Keshavamurthy <ani...@in...> */ -#include "iova.h" +#include <linux/iova.h> void init_iova_domain(struct iova_domain *iovad, unsigned long pfn_32bit) diff --git a/drivers/pci/iova.h b/drivers/pci/iova.h deleted file mode 100644 index 228f6c9..0000000 --- a/drivers/pci/iova.h +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Copyright (c) 2006, Intel Corporation. - * - * This file is released under the GPLv2. 
- * - * Copyright (C) 2006-2008 Intel Corporation - * Author: Anil S Keshavamurthy <ani...@in...> - * - */ - -#ifndef _IOVA_H_ -#define _IOVA_H_ - -#include <linux/types.h> -#include <linux/kernel.h> -#include <linux/rbtree.h> -#include <linux/dma-mapping.h> - -/* IO virtual address start page frame number */ -#define IOVA_START_PFN (1) - -/* iova structure */ -struct iova { - struct rb_node node; - unsigned long pfn_hi; /* IOMMU dish out addr hi */ - unsigned long pfn_lo; /* IOMMU dish out addr lo */ -}; - -/* holds all the iova translations for a domain */ -struct iova_domain { - spinlock_t iova_alloc_lock;/* Lock to protect iova allocation */ - spinlock_t iova_rbtree_lock; /* Lock to protect update of rbtree */ - struct rb_root rbroot; /* iova domain rbtree root */ - struct rb_node *cached32_node; /* Save last alloced node */ - unsigned long dma_32bit_pfn; -}; - -struct iova *alloc_iova_mem(void); -void free_iova_mem(struct iova *iova); -void free_iova(struct iova_domain *iovad, unsigned long pfn); -void __free_iova(struct iova_domain *iovad, struct iova *iova); -struct iova *alloc_iova(struct iova_domain *iovad, unsigned long size, - unsigned long limit_pfn, - bool size_aligned); -struct iova *reserve_iova(struct iova_domain *iovad, unsigned long pfn_lo, - unsigned long pfn_hi); -void copy_reserved_iova(struct iova_domain *from, struct iova_domain *to); -void init_iova_domain(struct iova_domain *iovad, unsigned long pfn_32bit); -struct iova *find_iova(struct iova_domain *iovad, unsigned long pfn); -void put_iova_domain(struct iova_domain *iovad); - -#endif diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h new file mode 100644 index 0000000..afc0ad9 --- /dev/null +++ b/include/linux/intel-iommu.h @@ -0,0 +1,344 @@ +/* + * Copyright (c) 2006, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * + * Copyright (C) 2006-2008 Intel Corporation + * Author: Ashok Raj <ash...@in...> + * Author: Anil S Keshavamurthy <ani...@in...> + */ + +#ifndef _INTEL_IOMMU_H_ +#define _INTEL_IOMMU_H_ + +#include <linux/types.h> +#include <linux/msi.h> +#include <linux/sysdev.h> +#include "iova.h" +#include <linux/io.h> + +/* + * We need a fixed PAGE_SIZE of 4K irrespective of + * arch PAGE_SIZE for IOMMU page tables. + */ +#define PAGE_SHIFT_4K (12) +#define PAGE_SIZE_4K (1UL << PAGE_SHIFT_4K) +#define PAGE_MASK_4K (((u64)-1) << PAGE_SHIFT_4K) +#define PAGE_ALIGN_4K(addr) (((addr) + PAGE_SIZE_4K - 1) & PAGE_MASK_4K) + +#define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT_4K) +#define DMA_32BIT_PFN IOVA_PFN(DMA_32BIT_MASK) +#define DMA_64BIT_PFN IOVA_PFN(DMA_64BIT_MASK) + +/* + * Intel IOMMU register specification per version 1.0 public spec. 
+ */ + +#define DMAR_VER_REG 0x0 /* Arch version supported by this IOMMU */ +#define DMAR_CAP_REG 0x8 /* Hardware supported capabilities */ +#define DMAR_ECAP_REG 0x10 /* Extended capabilities supported */ +#define DMAR_GCMD_REG 0x18 /* Global command register */ +#define DMAR_GSTS_REG 0x1c /* Global status register */ +#define DMAR_RTADDR_REG 0x20 /* Root entry table */ +#define DMAR_CCMD_REG 0x28 /* Context command reg */ +#define DMAR_FSTS_REG 0x34 /* Fault Status register */ +#define DMAR_FECTL_REG 0x38 /* Fault control register */ +#define DMAR_FEDATA_REG 0x3c /* Fault event interrupt data register */ +#define DMAR_FEADDR_REG 0x40 /* Fault event interrupt addr register */ +#define DMAR_FEUADDR_REG 0x44 /* Upper address register */ +#define DMAR_AFLOG_REG 0x58 /* Advanced Fault control */ +#define DMAR_PMEN_REG 0x64 /* Enable Protected Memory Region */ +#define DMAR_PLMBASE_REG 0x68 /* PMRR Low addr */ +#define DMAR_PLMLIMIT_REG 0x6c /* PMRR low limit */ +#define DMAR_PHMBASE_REG 0x70 /* pmrr high base addr */ +#define DMAR_PHMLIMIT_REG 0x78 /* pmrr high limit */ + +#define OFFSET_STRIDE (9) +/* +#define dmar_readl(dmar, reg) readl(dmar + reg) +#define dmar_readq(dmar, reg) ({ \ + u32 lo, hi; \ + lo = readl(dmar + reg); \ + hi = readl(dmar + reg + 4); \ + (((u64) hi) << 32) + lo; }) +*/ +static inline u64 dmar_readq(void __iomem *addr) +{ + u32 lo, hi; + lo = readl(addr); + hi = readl(addr + 4); + return (((u64) hi) << 32) + lo; +} + +static inline void dmar_writeq(void __iomem *addr, u64 val) +{ + writel((u32)val, addr); + writel((u32)(val >> 32), addr + 4); +} + +#define DMAR_VER_MAJOR(v) (((v) & 0xf0) >> 4) +#define DMAR_VER_MINOR(v) ((v) & 0x0f) + +/* + * Decoding Capability Register + */ +#define cap_read_drain(c) (((c) >> 55) & 1) +#define cap_write_drain(c) (((c) >> 54) & 1) +#define cap_max_amask_val(c) (((c) >> 48) & 0x3f) +#define cap_num_fault_regs(c) ((((c) >> 40) & 0xff) + 1) +#define cap_pgsel_inv(c) (((c) >> 39) & 1) + +#define cap_super_page_val(c) (((c) >> 34) & 0xf) +#define cap_super_offset(c) (((find_first_bit(&cap_super_page_val(c), 4)) \ + * OFFSET_STRIDE) + 21) + +#define cap_fault_reg_offset(c) ((((c) >> 24) & 0x3ff) * 16) +#define cap_max_fault_reg_offset(c) \ + (cap_fault_reg_offset(c) + cap_num_fault_regs(c) * 16) + +#define cap_zlr(c) (((c) >> 22) & 1) +#define cap_isoch(c) (((c) >> 23) & 1) +#define cap_mgaw(c) ((((c) >> 16) & 0x3f) + 1) +#define cap_sagaw(c) (((c) >> 8) & 0x1f) +#define cap_caching_mode(c) (((c) >> 7) & 1) +#define cap_phmr(c) (((c) >> 6) & 1) +#define cap_plmr(c) (((c) >> 5) & 1) +#define cap_rwbf(c) (((c) >> 4) & 1) +#define cap_afl(c) (((c) >> 3) & 1) +#define cap_ndoms(c) (((unsigned long)1) << (4 + 2 * ((c) & 0x7))) +/* + * Extended Capability Register + */ + +#define ecap_niotlb_iunits(e) ((((e) >> 24) & 0xff) + 1) +#define ecap_iotlb_offset(e) ((((e) >> 8) & 0x3ff) * 16) +#define ecap_max_iotlb_offset(e) \ + (ecap_iotlb_offset(e) + ecap_niotlb_iunits(e) * 16) +#define ecap_coherent(e) ((e) & 0x1) + + +/* IOTLB_REG */ +#define DMA_TLB_GLOBAL_FLUSH (((u64)1) << 60) +#define DMA_TLB_DSI_FLUSH (((u64)2) << 60) +#define DMA_TLB_PSI_FLUSH (((u64)3) << 60) +#define DMA_TLB_IIRG(type) ((type >> 60) & 7) +#define DMA_TLB_IAIG(val) (((val) >> 57) & 7) +#define DMA_TLB_READ_DRAIN (((u64)1) << 49) +#define DMA_TLB_WRITE_DRAIN (((u64)1) << 48) +#define DMA_TLB_DID(id) (((u64)((id) & 0xffff)) << 32) +#define DMA_TLB_IVT (((u64)1) << 63) +#define DMA_TLB_IH_NONLEAF (((u64)1) << 6) +#define DMA_TLB_MAX_SIZE (0x3f) + +/* PMEN_REG */ +#define 
DMA_PMEN_EPM (((u32)1)<<31) +#define DMA_PMEN_PRS (((u32)1)<<0) + +/* GCMD_REG */ +#define DMA_GCMD_TE (((u32)1) << 31) +#define DMA_GCMD_SRTP (((u32)1) << 30) +#define DMA_GCMD_SFL (((u32)1) << 29) +#define DMA_GCMD_EAFL (((u32)1) << 28) +#define DMA_GCMD_WBF (((u32)1) << 27) + +/* GSTS_REG */ +#define DMA_GSTS_TES (((u32)1) << 31) +#define DMA_GSTS_RTPS (((u32)1) << 30) +#define DMA_GSTS_FLS (((u32)1) << 29) +#define DMA_GSTS_AFLS (((u32)1) << 28) +#define DMA_GSTS_WBFS (((u32)1) << 27) + +/* CCMD_REG */ +#define DMA_CCMD_ICC (((u64)1) << 63) +#define DMA_CCMD_GLOBAL_INVL (((u64)1) << 61) +#define DMA_CCMD_DOMAIN_INVL (((u64)2) << 61) +#define DMA_CCMD_DEVICE_INVL (((u64)3) << 61) +#define DMA_CCMD_FM(m) (((u64)((m) & 0x3)) << 32) +#define DMA_CCMD_MASK_NOBIT 0 +#define DMA_CCMD_MASK_1BIT 1 +#define DMA_CCMD_MASK_2BIT 2 +#define DMA_CCMD_MASK_3BIT 3 +#define DMA_CCMD_SID(s) (((u64)((s) & 0xffff)) << 16) +#define DMA_CCMD_DID(d) ((u64)((d) & 0xffff)) + +/* FECTL_REG */ +#define DMA_FECTL_IM (((u32)1) << 31) + +/* FSTS_REG */ +#define DMA_FSTS_PPF ((u32)2) +#define DMA_FSTS_PFO ((u32)1) +#define dma_fsts_fault_record_index(s) (((s) >> 8) & 0xff) + +/* FRCD_REG, 32 bits access */ +#define DMA_FRCD_F (((u32)1) << 31) +#define dma_frcd_type(d) ((d >> 30) & 1) +#define dma_frcd_fault_reason(c) (c & 0xff) +#define dma_frcd_source_id(c) (c & 0xffff) +#define dma_frcd_page_addr(d) (d & (((u64)-1) << 12)) /* low 64 bit */ + +/* + * 0: Present + * 1-11: Reserved + * 12-63: Context Ptr (12 - (haw-1)) + * 64-127: Reserved + */ +struct root_entry { + u64 val; + u64 rsvd1; +}; +#define ROOT_ENTRY_NR (PAGE_SIZE_4K/sizeof(struct root_entry)) +static inline bool root_present(struct root_entry *root) +{ + return (root->val & 1); +} +static inline void set_root_present(struct root_entry *root) +{ + root->val |= 1; +} +static inline void set_root_value(struct root_entry *root, unsigned long value) +{ + root->val |= value & PAGE_MASK_4K; +} + +struct context_entry; +static inline struct context_entry * +get_context_addr_from_root(struct root_entry *root) +{ + return (struct context_entry *) + (root_present(root)?phys_to_virt( + root->val & PAGE_MASK_4K): + NULL); +} + +/* + * low 64 bits: + * 0: present + * 1: fault processing disable + * 2-3: translation type + * 12-63: address space root + * high 64 bits: + * 0-2: address width + * 3-6: aval + * 8-23: domain id + */ +struct context_entry { + u64 lo; + u64 hi; +}; +#define context_present(c) ((c).lo & 1) +#define context_fault_disable(c) (((c).lo >> 1) & 1) +#define context_translation_type(c) (((c).lo >> 2) & 3) +#define context_address_root(c) ((c).lo & PAGE_MASK_4K) +#define context_address_width(c) ((c).hi & 7) +#define context_domain_id(c) (((c).hi >> 8) & ((1 << 16) - 1)) + +#define context_set_present(c) do {(c).lo |= 1;} while (0) +#define context_set_fault_enable(c) \ + do {(c).lo &= (((u64)-1) << 2) | 1;} while (0) +#define context_set_translation_type(c, val) \ + do { \ + (c).lo &= (((u64)-1) << 4) | 3; \ + (c).lo |= ((val) & 3) << 2; \ + } while (0) +#define CONTEXT_TT_MULTI_LEVEL 0 +#define context_set_address_root(c, val) \ + do {(c).lo |= (val) & PAGE_MASK_4K;} while (0) +#define context_set_address_width(c, val) do {(c).hi |= (val) & 7;} while (0) +#define context_set_domain_id(c, val) \ + do {(c).hi |= ((val) & ((1 << 16) - 1)) << 8;} while (0) +#define context_clear_entry(c) do {(c).lo = 0; (c).hi = 0;} while (0) + +/* + * 0: readable + * 1: writable + * 2-6: reserved + * 7: super page + * 8-11: available + * 12-63: Host physcial address + 
*/ +struct dma_pte { + u64 val; +}; +#define dma_clear_pte(p) do {(p).val = 0;} while (0) + +#define DMA_PTE_READ (1) +#define DMA_PTE_WRITE (2) + +#define dma_set_pte_readable(p) do {(p).val |= DMA_PTE_READ;} while (0) +#define dma_set_pte_writable(p) do {(p).val |= DMA_PTE_WRITE;} while (0) +#define dma_set_pte_prot(p, prot) \ + do {(p).val = ((p).val & ~3) | ((prot) & 3); } while (0) +#define dma_pte_addr(p) ((p).val & PAGE_MASK_4K) +#define dma_set_pte_addr(p, addr) do {\ + (p).val |= ((addr) & PAGE_MASK_4K); } while (0) +#define dma_pte_present(p) (((p).val & 3) != 0) + +struct intel_iommu; + +struct dmar_domain { + int id; /* domain id */ + struct intel_iommu *iommu; /* back pointer to owning iommu */ + + struct list_head devices; /* all devices' list */ + struct iova_domain iovad; /* iova's that belong to this domain */ + + struct dma_pte *pgd; /* virtual address */ + spinlock_t mapping_lock; /* page table lock */ + int gaw; /* max guest address width */ + + /* adjusted guest address width, 0 is level 2 30-bit */ + int agaw; + +#define DOMAIN_FLAG_MULTIPLE_DEVICES 1 + int flags; +}; + +/* PCI domain-device relationship */ +struct device_domain_info { + struct list_head link; /* link to domain siblings */ + struct list_head global; /* link to global list */ + u8 bus; /* PCI bus numer */ + u8 devfn; /* PCI devfn number */ + struct pci_dev *dev; /* it's NULL for PCIE-to-PCI bridge */ + struct dmar_domain *domain; /* pointer to domain */ +}; + +extern int init_dmars(void); + +struct intel_iommu { + void __iomem *reg; /* Pointer to hardware regs, virtual addr */ + u64 cap; + u64 ecap; + unsigned long *domain_ids; /* bitmap of domains */ + struct dmar_domain **domains; /* ptr to domains */ + int seg; + u32 gcmd; /* Holds TE, EAFL. Don't need SRTP, SFL, WBF */ + spinlock_t lock; /* protect context, domain ids */ + spinlock_t register_lock; /* protect register handling */ + struct root_entry *root_entry; /* virtual address */ + + unsigned int irq; + unsigned char name[7]; /* Device Name */ + struct msi_msg saved_msg; + struct sys_device sysdev; +}; + +#ifndef CONFIG_DMAR_GFX_WA +static inline void iommu_prepare_gfx_mapping(void) +{ + return; +} +#endif /* !CONFIG_DMAR_GFX_WA */ + +#endif diff --git a/include/linux/iova.h b/include/linux/iova.h new file mode 100644 index 0000000..228f6c9 --- /dev/null +++ b/include/linux/iova.h @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2006, Intel Corporation. + * + * This file is released under the GPLv2. 
+ * + * Copyright (C) 2006-2008 Intel Corporation + * Author: Anil S Keshavamurthy <ani...@in...> + * + */ + +#ifndef _IOVA_H_ +#define _IOVA_H_ + +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/rbtree.h> +#include <linux/dma-mapping.h> + +/* IO virtual address start page frame number */ +#define IOVA_START_PFN (1) + +/* iova structure */ +struct iova { + struct rb_node node; + unsigned long pfn_hi; /* IOMMU dish out addr hi */ + unsigned long pfn_lo; /* IOMMU dish out addr lo */ +}; + +/* holds all the iova translations for a domain */ +struct iova_domain { + spinlock_t iova_alloc_lock;/* Lock to protect iova allocation */ + spinlock_t iova_rbtree_lock; /* Lock to protect update of rbtree */ + struct rb_root rbroot; /* iova domain rbtree root */ + struct rb_node *cached32_node; /* Save last alloced node */ + unsigned long dma_32bit_pfn; +}; + +struct iova *alloc_iova_mem(void); +void free_iova_mem(struct iova *iova); +void free_iova(struct iova_domain *iovad, unsigned long pfn); +void __free_iova(struct iova_domain *iovad, struct iova *iova); +struct iova *alloc_iova(struct iova_domain *iovad, unsigned long size, + unsigned long limit_pfn, + bool size_aligned); +struct iova *reserve_iova(struct iova_domain *iovad, unsigned long pfn_lo, + unsigned long pfn_hi); +void copy_reserved_iova(struct iova_domain *from, struct iova_domain *to); +void init_iova_domain(struct iova_domain *iovad, unsigned long pfn_32bit); +struct iova *find_iova(struct iova_domain *iovad, unsigned long pfn); +void put_iova_domain(struct iova_domain *iovad); + +#endif |
From: Kay, A. M <all...@in...> - 2008-05-05 21:37:17
|
Kvm-user-mode patch. Still todo: move vt.d to kvm-intel.ko module. Signed-off-by: Allen M Kay <all...@in...> ----- Kbuild | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) ----- diff --git a/kernel/Kbuild b/kernel/Kbuild index e3e97ab..7455605 100644 --- a/kernel/Kbuild +++ b/kernel/Kbuild @@ -1,7 +1,7 @@ EXTRA_CFLAGS := -I$(src)/include -include $(src)/external-module-compat.h obj-m := kvm.o kvm-intel.o kvm-amd.o kvm-objs := kvm_main.o x86.o mmu.o x86_emulate.o anon_inodes.o irq.o i8259.o \ - lapic.o ioapic.o preempt.o i8254.o + lapic.o ioapic.o preempt.o i8254.o vtd.o ifeq ($(CONFIG_KVM_TRACE),y) kvm-objs += kvm_trace.o endif |
From: Kay, A. M <all...@in...> - 2008-05-05 21:36:34
|
Kvm kernel changes. Signed-off-by: Allen M Kay <all...@in...> ------ arch/x86/kvm/Makefile | 2 arch/x86/kvm/vtd.c | 183 +++++++++++++++++++++++++++++++++++++++++++++ arch/x86/kvm/x86.c | 7 + include/asm-x86/kvm_host.h | 3 include/asm-x86/kvm_para.h | 1 include/linux/kvm_host.h | 6 + virt/kvm/kvm_main.c | 3 7 files changed, 204 insertions(+), 1 deletion(-) ------ diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index c97d35c..b1057fb 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -12,7 +12,7 @@ EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm kvm-objs := $(common-objs) x86.o mmu.o x86_emulate.o i8259.o irq.o lapic.o \ i8254.o obj-$(CONFIG_KVM) += kvm.o -kvm-intel-objs = vmx.o +kvm-intel-objs = vmx.o vtd.o obj-$(CONFIG_KVM_INTEL) += kvm-intel.o kvm-amd-objs = svm.o obj-$(CONFIG_KVM_AMD) += kvm-amd.o diff --git a/arch/x86/kvm/vtd.c b/arch/x86/kvm/vtd.c new file mode 100644 index 0000000..9a080b5 --- /dev/null +++ b/arch/x86/kvm/vtd.c @@ -0,0 +1,183 @@ +/* + * Copyright (c) 2006, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * + * Copyright (C) 2006-2008 Intel Corporation + * Author: Allen M. Kay <all...@in...> + * Author: Weidong Han <wei...@in...> + */ + +#include <linux/list.h> +#include <linux/kvm_host.h> +#include <linux/pci.h> +#include <linux/dmar.h> +#include <linux/intel-iommu.h> + +//#define DEBUG + +#define DEFAULT_DOMAIN_ADDRESS_WIDTH 48 + +struct dmar_drhd_unit * dmar_find_matched_drhd_unit(struct pci_dev *dev); +struct dmar_domain * iommu_alloc_domain(struct intel_iommu *iommu); +void iommu_free_domain(struct dmar_domain *domain); +int domain_init(struct dmar_domain *domain, int guest_width); +int domain_context_mapping(struct dmar_domain *d, + struct pci_dev *pdev); +int domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova, + u64 hpa, size_t size, int prot); +void detach_domain_for_dev(struct dmar_domain *domain, u8 bus, u8 devfn); +struct dmar_domain * find_domain(struct pci_dev *pdev); + + +int kvm_iommu_map_pages(struct kvm *kvm, + gfn_t base_gfn, unsigned long npages) +{ + unsigned long gpa; + struct page *page; + hpa_t hpa; + int j, write; + struct vm_area_struct *vma; + + if (!kvm->arch.domain) + return 1; + + gpa = base_gfn << PAGE_SHIFT; + page = gfn_to_page(kvm, base_gfn); + hpa = page_to_phys(page); + + printk(KERN_DEBUG "kvm_iommu_map_page: gpa = %lx\n", gpa); + printk(KERN_DEBUG "kvm_iommu_map_page: hpa = %llx\n", hpa); + printk(KERN_DEBUG "kvm_iommu_map_page: size = %lx\n", + npages*PAGE_SIZE); + + for (j = 0; j < npages; j++) { + gpa += PAGE_SIZE; + page = gfn_to_page(kvm, gpa >> PAGE_SHIFT); + hpa = page_to_phys(page); + domain_page_mapping(kvm->arch.domain, gpa, hpa, PAGE_SIZE, + DMA_PTE_READ | DMA_PTE_WRITE); + vma = find_vma(current->mm, gpa); + if (!vma) + return 1; + write = (vma->vm_flags & VM_WRITE) != 0; + get_user_pages(current, current->mm, gpa, + PAGE_SIZE, write, 0, NULL, NULL); + } 
+ return 0; +} +EXPORT_SYMBOL_GPL(kvm_iommu_map_pages); + +static int kvm_iommu_map_memslots(struct kvm *kvm) +{ + int i, status; + for (i = 0; i < kvm->nmemslots; i++) { + status = kvm_iommu_map_pages(kvm, kvm->memslots[i].base_gfn, + kvm->memslots[i].npages); + if (status) + return status; + } + return 0; +} + +int kvm_iommu_map_guest(struct kvm *kvm, + struct kvm_pci_passthrough_dev *pci_pt_dev) +{ + struct dmar_drhd_unit *drhd; + struct dmar_domain *domain; + struct intel_iommu *iommu; + struct pci_dev *pdev = NULL; + + printk(KERN_DEBUG "kvm_iommu_map_guest: host bdf = %x:%x:%x\n", + pci_pt_dev->host.busnr, + PCI_SLOT(pci_pt_dev->host.devfn), + PCI_FUNC(pci_pt_dev->host.devfn)); + + for_each_pci_dev(pdev) { + if ((pdev->bus->number == pci_pt_dev->host.busnr) && + (pdev->devfn == pci_pt_dev->host.devfn)) + goto found; + } + goto not_found; +found: + pci_pt_dev->pdev = pdev; + + drhd = dmar_find_matched_drhd_unit(pdev); + if (!drhd) { + printk(KERN_ERR "kvm_iommu_map_guest: drhd == NULL\n"); + goto not_found; + } + + printk(KERN_DEBUG "kvm_iommu_map_guest: reg_base_addr = %llx\n", + drhd->reg_base_addr); + + iommu = drhd->iommu; + if (!iommu) { + printk(KERN_ERR "kvm_iommu_map_guest: iommu == NULL\n"); + goto not_found; + } + domain = iommu_alloc_domain(iommu); + if (!domain) { + printk(KERN_ERR "kvm_iommu_map_guest: domain == NULL\n"); + goto not_found; + } + if (domain_init(domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) { + printk(KERN_ERR "kvm_iommu_map_guest: domain_init() failed\n"); + goto not_found; + } + kvm->arch.domain = domain; + kvm_iommu_map_memslots(kvm); + domain_context_mapping(kvm->arch.domain, pdev); + return 0; +not_found: + return 1; +} +EXPORT_SYMBOL_GPL(kvm_iommu_map_guest); + +int kvm_iommu_unmap_guest(struct kvm *kvm) +{ + struct dmar_domain *domain; + struct kvm_pci_pt_dev_list *entry; + struct pci_dev *pdev; + + list_for_each_entry(entry, &kvm->arch.domain->devices, list) { + printk(KERN_DEBUG "kvm_iommu_unmap_guest: %x:%x:%x\n", + entry->pt_dev.host.busnr, + PCI_SLOT(entry->pt_dev.host.devfn), + PCI_FUNC(entry->pt_dev.host.devfn)); + + pdev = entry->pt_dev.pdev; + + if (pdev == NULL) { + printk("kvm_iommu_unmap_guest: pdev == NULL\n"); + return 1; + } + + /* detach kvm dmar domain */ + detach_domain_for_dev(kvm->arch.domain, + pdev->bus->number, pdev->devfn); + + /* now restore back linux iommu domain */ + domain = find_domain(pdev); + if (domain) + domain_context_mapping(domain, pdev); + else + printk(KERN_DEBUG + "kvm_iommu_unmap_guest: domain == NULL\n"); + } + /* unmap guest memory in vt-d page table */ + iommu_free_domain(kvm->arch.domain); + return 0; +} +EXPORT_SYMBOL_GPL(kvm_iommu_unmap_guest); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a97d2e2..a877db2 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -257,6 +257,7 @@ static void kvm_free_pci_passthrough(struct kvm *kvm) list_del(&pci_pt_dev->list); } + kvm->arch.domain = NULL; } unsigned long segment_base(u16 selector) @@ -1846,6 +1847,10 @@ long kvm_arch_vm_ioctl(struct file *filp, if (copy_from_user(&pci_pt_dev, argp, sizeof pci_pt_dev)) goto out; + r = kvm_iommu_map_guest(kvm, &pci_pt_dev); + if (r) + goto out; + r = kvm_vm_ioctl_pci_pt_dev(kvm, &pci_pt_dev); if (r) goto out; @@ -4088,6 +4093,8 @@ static void kvm_free_vcpus(struct kvm *kvm) void kvm_arch_destroy_vm(struct kvm *kvm) { + if (kvm->arch.domain) + kvm_iommu_unmap_guest(kvm); kvm_free_pci_passthrough(kvm); kvm_free_pit(kvm); kfree(kvm->arch.vpic); diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h 
index 4662d49..70248cb 100644 --- a/include/asm-x86/kvm_host.h +++ b/include/asm-x86/kvm_host.h @@ -19,6 +19,8 @@ #include <linux/kvm_types.h> #include <asm/desc.h> +#include <linux/dmar.h> +#include <linux/intel-iommu.h> #define KVM_MAX_VCPUS 16 #define KVM_MEMORY_SLOTS 32 @@ -318,6 +320,7 @@ struct kvm_arch{ */ struct list_head active_mmu_pages; struct list_head pci_pt_dev_head; + struct dmar_domain *domain; struct kvm_pic *vpic; struct kvm_ioapic *vioapic; struct kvm_pit *vpit; diff --git a/include/asm-x86/kvm_para.h b/include/asm-x86/kvm_para.h index 5f93b78..6202ed1 100644 --- a/include/asm-x86/kvm_para.h +++ b/include/asm-x86/kvm_para.h @@ -170,5 +170,6 @@ struct kvm_pci_pt_info { struct kvm_pci_passthrough_dev { struct kvm_pci_pt_info guest; struct kvm_pci_pt_info host; + struct pci_dev *pdev; /* kernel device pointer for host dev */ }; #endif diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 4e16682..bcfcf78 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -276,6 +276,12 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *v); int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu); void kvm_vcpu_kick(struct kvm_vcpu *vcpu); +int kvm_iommu_map_pages(struct kvm *kvm, gfn_t base_gfn, + unsigned long npages); +int kvm_iommu_map_guest(struct kvm *kvm, + struct kvm_pci_passthrough_dev *pci_pt_dev); +int kvm_iommu_unmap_guest(struct kvm *kvm); + static inline void kvm_guest_enter(void) { account_system_vtime(current); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index d3cb4cc..e46614a 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -309,6 +309,9 @@ int __kvm_set_memory_region(struct kvm *kvm, new.npages = npages; new.flags = mem->flags; + /* map the pages in iommu page table */ + kvm_iommu_map_pages(kvm, base_gfn, npages); + /* Disallow changing a memory slot's size. */ r = -EINVAL; if (npages && old.npa |
From: Kay, A. M <all...@in...> - 2008-05-05 21:35:32
|
Following three patches contains vt-d support for pci passthrough. It contains diff's base on Amit's 4/22 passthrough tree. The hardware environment used for this work is an Intel Weybridge system (Q35). The passthrough device is an E1000 NIC. I'm still using irqhook mechanism for interrupt injection as I had problem with irqchip machanism. Following is the command line I used to start the guest. /usr/local/bin/qemu-system-x86_64 -boot c -hda /etc/xen/fc5_32.img -m 256 -net none -pcidevice e1000/01:00.0-16 -no-kvm-irqchip Remaining tasks include: 1) Generated vtd.o with kvm-intel.ko instead of kvm.ko. 2) Make iommu hooks in generic code to be non-Intel specific Let me know of your feedbacks. Thanks. Allen |
From: Jack S. <st...@sg...> - 2008-05-05 19:46:24
|
On Mon, May 05, 2008 at 08:34:05PM +0200, Andrea Arcangeli wrote: > On Mon, May 05, 2008 at 12:25:06PM -0500, Jack Steiner wrote: > > Agree. My apologies... I should have caught it. > > No problem. > > > __mmu_notifier_register/__mmu_notifier_unregister seems like a better way to > > go, although either is ok. > > If you also like __mmu_notifier_register more I'll go with it. The > bitflags seems like a bit of overkill as I can't see the need of any > other bitflag other than this one and they also can't be removed as > easily in case you'll find a way to call it outside the lock later. > > > Let me finish my testing. At one time, I did not use ->release but > > with all the locking & teardown changes, I need to do some reverification. I finished testing & everything looks good. I do use the ->release callout but mainly as a performance hint that teardown is in progress & that TLB flushing is no longer needed. (GRU TLB entries are tagged with a task-specific ID that will not be reused until a full TLB purge is done. This eliminates the requirement to purge at task-exit.) Normally, a notifier is registered when a GRU segment is mmaped, and unregistered when the segment is unmapped. Well behaved tasks will not have a GRU or a notifier when exit starts. If a task fails to unmap a GRU segment, they still exist at the start of exit. On the ->release callout, I set a flag in the container of my mmu_notifier that exit has started. As VMA are cleaned up, TLB flushes are skipped because of the flag is set. When the GRU VMA is deleted, I free my structure containing the notifier. I _think_ works. Do you see any problems? I should also mention that I have an open-coded function that possibly belongs in mmu_notifier.c. A user is allowed to have multiple GRU segments. Each GRU has a couple of data structures linked to the VMA. All, however, need to share the same notifier. I currently open code a function that scans the notifier list to determine if a GRU notifier already exists. If it does, I update a refcnt & use it. Otherwise, I register a new one. All of this is protected by the mmap_sem. Just in case I mangled the above description, I'll attach a copy of the GRU mmuops code. --- jack |
From: Andrea A. <an...@qu...> - 2008-05-05 18:34:06
|
On Mon, May 05, 2008 at 12:25:06PM -0500, Jack Steiner wrote: > Agree. My apologies... I should have caught it. No problem. > __mmu_notifier_register/__mmu_notifier_unregister seems like a better way to > go, although either is ok. If you also like __mmu_notifier_register more I'll go with it. The bitflags seems like a bit of overkill as I can't see the need of any other bitflag other than this one and they also can't be removed as easily in case you'll find a way to call it outside the lock later. > Let me finish my testing. At one time, I did not use ->release but > with all the locking & teardown changes, I need to do some reverification. If you didn't implement it you shall apply this patch but you shall read carefully the comment I written that covers that usage case. diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -29,10 +29,25 @@ struct mmu_notifier_ops { /* * Called either by mmu_notifier_unregister or when the mm is * being destroyed by exit_mmap, always before all pages are - * freed. It's mandatory to implement this method. This can - * run concurrently with other mmu notifier methods and it + * freed. This can run concurrently with other mmu notifier + * methods (the ones invoked outside the mm context) and it * should tear down all secondary mmu mappings and freeze the - * secondary mmu. + * secondary mmu. If this method isn't implemented you've to + * be sure that nothing could possibly write to the pages + * through the secondary mmu by the time the last thread with + * tsk->mm == mm exits. + * + * As side note: the pages freed after ->release returns could + * be immediately reallocated by the gart at an alias physical + * address with a different cache model, so if ->release isn't + * implemented because all memory accesses through the + * secondary mmu implicitly are terminated by the time the + * last thread of this mm quits, you've also to be sure that + * speculative hardware operations can't allocate dirty + * cachelines in the cpu that could not be snooped and made + * coherent with the other read and write operations happening + * through the gart alias address, leading to memory + * corruption. */ void (*release)(struct mmu_notifier *mn, struct mm_struct *mm); diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -59,7 +59,8 @@ void __mmu_notifier_release(struct mm_st * from establishing any more sptes before all the * pages in the mm are freed. */ - mn->ops->release(mn, mm); + if (mn->ops->release) + mn->ops->release(mn, mm); srcu_read_unlock(&mm->mmu_notifier_mm->srcu, srcu); spin_lock(&mm->mmu_notifier_mm->lock); } @@ -251,7 +252,8 @@ void mmu_notifier_unregister(struct mmu_ * guarantee ->release is called before freeing the * pages. */ - mn->ops->release(mn, mm); + if (mn->ops->release) + mn->ops->release(mn, mm); srcu_read_unlock(&mm->mmu_notifier_mm->srcu, srcu); } else spin_unlock(&mm->mmu_notifier_mm->lock); |
From: Hollis B. <ho...@us...> - 2008-05-05 18:00:45
|
On Monday 05 May 2008 11:04:52 Jerone Young wrote: > These patches fell through the cracks. > > This set of patches fixes setting memory for PowerPC bamboo board model. Besides just setting memory in qemu, you must also set it in the device tree. This sets the memory in the device tree so that it can be something other then the hard coded memory size of 144MB. > > Signed-off-by: Jerone Young <jy...@us...> Acked-by: Hollis Blanchard <ho...@us...> Avi, please apply to kvm-userspace; thanks. -- Hollis Blanchard IBM Linux Technology Center |
From: Jack S. <st...@sg...> - 2008-05-05 17:25:11
|
On Mon, May 05, 2008 at 07:14:34PM +0200, Andrea Arcangeli wrote: > On Mon, May 05, 2008 at 11:21:13AM -0500, Jack Steiner wrote: > > The GRU does the registration/deregistration of mmu notifiers from mmap/munmap. > > At this point, the mmap_sem is already held writeable. I hit a deadlock > > in mm_lock. > > It'd been better to know about this detail earlier, Agree. My apologies... I should have caught it. > but frankly this > is a minor problem, the important thing is we all agree together on > the more difficult parts ;). > > > A quick fix would be to do one of the following: > > > > - move the mmap_sem locking to the caller of the [de]registration routines. > > Since the first/last thing done in mm_lock/mm_unlock is to > > acquire/release mmap_sem, this change does not cause major changes. > > I don't like this solution very much. Nor GRU nor KVM will call > mmu_notifier_register inside the mmap_sem protected sections, so I > think the default mmu_notifier_register should be smp safe by itself > without requiring additional locks to be artificially taken externally > (especially because the need for mmap_sem in write mode is a very > mmu_notifier internal detail). > > > - add a flag to mmu_notifier_[un]register routines to indicate > > if mmap_sem is already locked. > > The interface would change like this: > > #define MMU_NOTIFIER_REGISTER_MMAP_SEM (1<<0) > void mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm, > unsigned long mmu_notifier_flags); That works... > > A third solution is to add: > > /* > * This must can be called instead of mmu_notifier_register after > * taking the mmap_sem in write mode (read mode isn't enough). > */ > void __mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm); > > Do you still prefer the bitflag or you prefer > __mmu_notifier_register. It's ok either ways, except > __mmu_notifier_reigster could be removed in a backwards compatible > way, the bitflag can't. > > > I've temporarily deleted the mm_lock locking of mmap_sem and am continuing to > > test. More later.... __mmu_notifier_register/__mmu_notifier_unregister seems like a better way to go, although either is ok. > > Sure! In the meantime go ahead this way. > > Another very minor change I've been thinking about is to make > ->release not mandatory. It happens that with KVM ->release isn't > strictly required because after mm_users reaches 0, no guest could > possibly run anymore. So I'm using ->release only for debugging by > placing -1UL in the root shadow pagetable, to be sure ;). So because > at least one user won't strictly require ->release being consistent in > having all method optional may be nicer. Alternatively we could make > them all mandatory and if somebody doesn't need one of the methods it > should implement it as a dummy function. Both ways have pros and cons, > but they don't make any difference to us in practice. If I've to > change the patch for the mmap_sem taken during registration I may as > well cleanup this minor bit. Let me finish my testing. At one time, I did not use ->release but with all the locking & teardown changes, I need to do some reverification. --- jack |
From: Andrea A. <an...@qu...> - 2008-05-05 17:14:35
|
On Mon, May 05, 2008 at 11:21:13AM -0500, Jack Steiner wrote: > The GRU does the registration/deregistration of mmu notifiers from mmap/munmap. > At this point, the mmap_sem is already held writeable. I hit a deadlock > in mm_lock. It'd been better to know about this detail earlier, but frankly this is a minor problem, the important thing is we all agree together on the more difficult parts ;). > A quick fix would be to do one of the following: > > - move the mmap_sem locking to the caller of the [de]registration routines. > Since the first/last thing done in mm_lock/mm_unlock is to > acquire/release mmap_sem, this change does not cause major changes. I don't like this solution very much. Nor GRU nor KVM will call mmu_notifier_register inside the mmap_sem protected sections, so I think the default mmu_notifier_register should be smp safe by itself without requiring additional locks to be artificially taken externally (especially because the need for mmap_sem in write mode is a very mmu_notifier internal detail). > - add a flag to mmu_notifier_[un]register routines to indicate > if mmap_sem is already locked. The interface would change like this: #define MMU_NOTIFIER_REGISTER_MMAP_SEM (1<<0) void mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long mmu_notifier_flags); A third solution is to add: /* * This must can be called instead of mmu_notifier_register after * taking the mmap_sem in write mode (read mode isn't enough). */ void __mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm); Do you still prefer the bitflag or you prefer __mmu_notifier_register. It's ok either ways, except __mmu_notifier_reigster could be removed in a backwards compatible way, the bitflag can't. > I've temporarily deleted the mm_lock locking of mmap_sem and am continuing to > test. More later.... Sure! In the meantime go ahead this way. Another very minor change I've been thinking about is to make ->release not mandatory. It happens that with KVM ->release isn't strictly required because after mm_users reaches 0, no guest could possibly run anymore. So I'm using ->release only for debugging by placing -1UL in the root shadow pagetable, to be sure ;). So because at least one user won't strictly require ->release being consistent in having all method optional may be nicer. Alternatively we could make them all mandatory and if somebody doesn't need one of the methods it should implement it as a dummy function. Both ways have pros and cons, but they don't make any difference to us in practice. If I've to change the patch for the mmap_sem taken during registration I may as well cleanup this minor bit. Also note the rculist.h patch you sent earlier won't work against mainline so I can't incorporate it in my patchset, Andrew will have to apply it as mmu-notifier-core-mm after incorporating mmu-notifier-core into -mm. Until a new update is released, mmu-notifier-core v15 remains ok for merging, no known bugs, here we're talking about a new and simple feature and a tiny cleanup that nobody can notice anyway. |
From: Chris L. <cla...@re...> - 2008-05-05 17:07:51
|
Attached is a patch that fixes a guest crash when booting older Linux kernels. The problem stems from the fact that we are currently emulating MSR_K7_EVNTSEL[0-3], but not emulating MSR_K7_PERFCTR[0-3]. Because of this, setup_k7_watchdog() in the Linux kernel receives a GPF when it attempts to write into MSR_K7_PERFCTR, which causes an OOPs. The patch fixes it by just "fake" emulating the appropriate MSRs, throwing away the data in the process. This causes the NMI watchdog to not actually work, but it's not such a big deal in a virtualized environment. When we get a write to one of these counters, we printk_ratelimit() a warning. I decided to print it out for all writes, even if the data is 0; it doesn't seem to make sense to me to special case when data == 0. Tested by myself on a RHEL-4 guest, and Joerg Roedel on a Windows XP 64-bit guest. Signed-off-by: Chris Lalancette <cla...@re...> |
From: Jack S. <st...@sg...> - 2008-05-05 16:21:25
|
On Fri, May 02, 2008 at 05:05:04PM +0200, Andrea Arcangeli wrote: > # HG changeset patch > # User Andrea Arcangeli <an...@qu...> > # Date 1209740175 -7200 > # Node ID 1489529e7b53d3f2dab8431372aa4850ec821caa > # Parent 5026689a3bc323a26d33ad882c34c4c9c9a3ecd8 > mmu-notifier-core I upgraded to the latest mmu notifier patch & hit a deadlock. (Sorry - I should have seen this earlier but I haven't tracked the last couple of patches). The GRU does the registration/deregistration of mmu notifiers from mmap/munmap. At this point, the mmap_sem is already held writeable. I hit a deadlock in mm_lock. A quick fix would be to do one of the following: - move the mmap_sem locking to the caller of the [de]registration routines. Since the first/last thing done in mm_lock/mm_unlock is to acquire/release mmap_sem, this change does not cause major changes. - add a flag to mmu_notifier_[un]register routines to indicate if mmap_sem is already locked. I've temporarily deleted the mm_lock locking of mmap_sem and am continuing to test. More later.... --- jack |
From: Jerone Y. <jy...@us...> - 2008-05-05 16:07:38
|
# HG changeset patch # User Jerone Young <jy...@us...> # Date 1210003411 18000 # Branch merge # Node ID c455452c9b217abed8a2e6147bbeb91f33ff1799 # Parent cf3ccc3add69052aade695c746151b1cb8812252 Fix memory defined in device tree by declaring it dynamically for bamboo board model This fixes a issue where the amount of memory is not properly being defined in the device tree. It currently is hardcoded for 144MB. The result is that if you specify a memory size below the hardcoded size, the guest crashes. This patch now dynamically changes the device tree to the memory value specified. Signed-off-by: Jerone Young <jy...@us...> diff --git a/qemu/hw/ppc440_bamboo.c b/qemu/hw/ppc440_bamboo.c --- a/qemu/hw/ppc440_bamboo.c +++ b/qemu/hw/ppc440_bamboo.c @@ -50,6 +50,7 @@ void bamboo_init(ram_addr_t ram_size, in int i=0, k=0; uint32_t cpu_freq; uint32_t timebase_freq; + uint32_t mem_reg_property[]={0, 0, ram_size}; printf("%s: START\n", __func__); @@ -73,6 +74,7 @@ void bamboo_init(ram_addr_t ram_size, in printf("WARNING: %i MB left over memory is ram\n", bytes_to_mb((int)tmp_ram_size)); ram_size -= tmp_ram_size; + mem_reg_property[2] = ram_size; } /* Setup CPU */ @@ -159,6 +161,8 @@ void bamboo_init(ram_addr_t ram_size, in /* manipulate device tree in memory */ dt_cell(fdt, "/cpus/cpu@0", "clock-frequency", cpu_freq); dt_cell(fdt, "/cpus/cpu@0", "timebase-frequency", timebase_freq); + dt_cell_multi(fdt, "/memory", "reg", mem_reg_property, + sizeof(mem_reg_property)); dt_cell(fdt, "/chosen", "linux,initrd-start", initrd_base); dt_cell(fdt, "/chosen", "linux,initrd-end", (initrd_base + initrd_size)); |
From: Jerone Y. <jy...@us...> - 2008-05-05 16:07:34
|
# HG changeset patch # User Jerone Young <jy...@us...> # Date 1210003408 18000 # Branch merge # Node ID cf3ccc3add69052aade695c746151b1cb8812252 # Parent 97e439fdd4e91c3fb1ef9055f073add55084d69f Add function dt_cell_multi to hw/device_tree.c This patch adds the function dt_cell_multi to allow manipulation of device tree properties that contain multiple 32-bit values. Signed-off-by: Jerone Young <jy...@us...> diff --git a/qemu/hw/device_tree.c b/qemu/hw/device_tree.c --- a/qemu/hw/device_tree.c +++ b/qemu/hw/device_tree.c @@ -162,6 +162,21 @@ void dt_cell(void *fdt, char *node_path, } } +/* This function is to manipulate a cell with multiple values */ +void dt_cell_multi(void *fdt, char *node_path, char *property, + uint32_t *val_array, int size) +{ + int offset; + int ret; + offset = get_offset_of_node(fdt, node_path); + ret = fdt_setprop(fdt, offset, property, val_array, size); + if (ret < 0) { + printf("Unable to set device tree property '%s'\n", + property); + exit(1); + } +} + void dt_string(void *fdt, char *node_path, char *property, char *string) { diff --git a/qemu/hw/device_tree.h b/qemu/hw/device_tree.h --- a/qemu/hw/device_tree.h +++ b/qemu/hw/device_tree.h @@ -19,6 +19,8 @@ void dump_device_tree_to_file(void *fdt, void dump_device_tree_to_file(void *fdt, char *filename); void dt_cell(void *fdt, char *node_path, char *property, uint32_t val); +void dt_cell_multi(void *fdt, char *node_path, char *property, + uint32_t *val_array, int size); void dt_string(void *fdt, char *node_path, char *property, char *string); void dt_node(void *fdt, char *node_parent_path, char *name); |
From: Jerone Y. <jy...@us...> - 2008-05-05 16:07:28
|
These patches fell through the cracks. This set of patches fixes setting memory for the PowerPC bamboo board model. Besides just setting memory in qemu, you must also set it in the device tree. This sets the memory in the device tree so that it can be something other than the hardcoded memory size of 144MB. Signed-off-by: Jerone Young <jy...@us...> |
From: Marcelo T. <mto...@re...> - 2008-05-05 15:29:55
|
On Mon, May 05, 2008 at 09:47:59AM +0200, Gerd Hoffmann wrote: > Marcelo Tosatti wrote: > > On Thu, Apr 24, 2008 at 10:37:04AM +0200, Gerd Hoffmann wrote: > >> Hi folks, > >> > >> My first attempt to send out a patch series with git ... > >> > >> The patches fix the kvm paravirt clocksource code to be compatible with > >> xen and they also factor out some code which can be shared into > >> separate source files used by both kvm and xen. > > > > The issue with SMP guests is still present. Booting with "nohz=off" resolves it. > > > > Same symptoms as before, apic_timer_fn for one of the vcpus is ticking way slower > > than the remaining ones: > > > > [root@localhost ~]# cat /proc/timer_stats | grep apic > > 391, 4125 qemu-system-x86 apic_mmio_write (apic_timer_fn) > > 2103, 4126 qemu-system-x86 apic_mmio_write (apic_timer_fn) > > 1896, 4127 qemu-system-x86 apic_mmio_write (apic_timer_fn) > > 1857, 4128 qemu-system-x86 apic_mmio_write (apic_timer_fn) > What userspace version is this? With iothread support? Or an older one > where the vcpu0 thread also handles all the I/O? Is 4x needed to > reproduce or do you see it with 2x too? What host? F8 host, recent kvm-userspace.git (so with IO thread), recent kvm.git (plus your patches), haven't tried 2x but I think 4x is not necessary to reproduce the problem. > A quick test with xenner (which has a separate I/O thread) didn't show > anything unusual. Going to investigate ... Give a pure kvm guest a try, it's pretty easy to reproduce. |
From: Avi K. <av...@qu...> - 2008-05-05 14:31:35
|
Daniel P. Berrange wrote: > I'm forwarding this patch from upstream QEMU because it's important to get > this fixed in KVM to make serial console installs usable now that libvirt can > talk to KVM serial ports over PTYs. > > It was reported in this thread: > > http://lists.gnu.org/archive/html/qemu-devel/2008-05/msg00014.html > > With the final version of the patch here: > > http://lists.gnu.org/archive/html/qemu-devel/2008-05/msg00135.html > > Recently committed to SVN > > http://svn.savannah.nongnu.org/viewvc?view=rev&root=qemu&revision=4338 > > > [...] I've just merged qemu-svn into kvm, unfortunately two revisions short of 4338. Once my tree passes the regression tests, I'll merge again so this patch is included. -- error compiling committee.c: too many arguments to function |
From: Avi K. <av...@qu...> - 2008-05-05 14:27:52
|
Anthony Liguori wrote: >> >> Please break the SIGUSR1 changes into a separate patch. Ditto with >> *fd syscall compat. > > Done. I didn't make the syscall compat stuff separate patches because > that would break bisect on older hosts. However, I did split it up > logically between the remove sigusr1 patch and the signalfd patch. > Not if you placed the compat patch before the use (e.g. a patch which adds kvm_signalfd() but no uses). -- error compiling committee.c: too many arguments to function |
From: Anthony L. <ali...@us...> - 2008-05-05 14:00:39
|
It's a little odd to use signals to raise a notification on a file descriptor when we can just work directly with a file descriptor instead. This patch converts the SIGUSR1-based notification in the io-thread to instead use an eventfd file descriptor. If eventfd isn't available, we use a pipe() instead. The benefit of using eventfd is that multiple notifications will be batched into a single IO event. Signed-off-by: Anthony Liguori <ali...@us...> diff --git a/qemu/Makefile.target b/qemu/Makefile.target index 2316c92..db6912e 100644 --- a/qemu/Makefile.target +++ b/qemu/Makefile.target @@ -203,7 +203,7 @@ CPPFLAGS+=-I$(SRC_PATH)/tcg/sparc endif ifeq ($(USE_KVM), 1) -LIBOBJS+=qemu-kvm.o +LIBOBJS+=qemu-kvm.o kvm-compatfd.o endif ifdef CONFIG_SOFTFLOAT LIBOBJS+=fpu/softfloat.o diff --git a/qemu/kvm-compatfd.c b/qemu/kvm-compatfd.c new file mode 100644 index 0000000..1b030ba --- /dev/null +++ b/qemu/kvm-compatfd.c @@ -0,0 +1,33 @@ +/* + * signalfd/eventfd compatibility + * + * Copyright IBM, Corp. 2008 + * + * Authors: + * Anthony Liguori <ali...@us...> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ + +#include "qemu-common.h" +#include "qemu-kvm.h" + +#include <sys/syscall.h> + +int kvm_eventfd(int *fds) +{ +#if defined(SYS_eventfd) + int ret; + + ret = syscall(SYS_eventfd, 0); + if (ret >= 0) { + fds[0] = fds[1] = ret; + return 0; + } else if (!(ret == -1 && errno == ENOSYS)) + return ret; +#endif + + return pipe(fds); +} diff --git a/qemu/qemu-kvm.c b/qemu/qemu-kvm.c index 9a9bf59..7134e56 100644 --- a/qemu/qemu-kvm.c +++ b/qemu/qemu-kvm.c @@ -15,6 +15,8 @@ int kvm_pit = 1; #include <string.h> #include "hw/hw.h" #include "sysemu.h" +#include "qemu-common.h" +#include "console.h" #include "qemu-kvm.h" #include <libkvm.h> @@ -61,6 +63,7 @@ struct vcpu_info { } vcpu_info[256]; pthread_t io_thread; +static int io_thread_fd = -1; static inline unsigned long kvm_get_thread_id(void) { @@ -213,7 +216,7 @@ static int kvm_eat_signal(struct qemu_kvm_signal_table *waitset, CPUState *env, if (env && vcpu_info[env->cpu_index].stop) { vcpu_info[env->cpu_index].stop = 0; vcpu_info[env->cpu_index].stopped = 1; - pthread_kill(io_thread, SIGUSR1); + qemu_kvm_notify_work(); } pthread_mutex_unlock(&qemu_mutex); @@ -418,7 +421,6 @@ static void qemu_kvm_init_signal_tables(void) kvm_add_signal(&io_signal_table, SIGIO); kvm_add_signal(&io_signal_table, SIGALRM); - kvm_add_signal(&io_signal_table, SIGUSR1); kvm_add_signal(&io_signal_table, SIGUSR2); kvm_add_signal(&vcpu_signal_table, SIG_IPI); @@ -440,8 +442,51 @@ int kvm_init_ap(void) void qemu_kvm_notify_work(void) { - if (io_thread) - pthread_kill(io_thread, SIGUSR1); + uint64_t value = 1; + char buffer[8]; + size_t offset = 0; + + if (io_thread_fd == -1) + return; + + memcpy(buffer, &value, sizeof(value)); + + while (offset < 8) { + ssize_t len; + + len = write(io_thread_fd, buffer + offset, 8 - offset); + if (len == -1 && errno == EINTR) + continue; + + if (len <= 0) + break; + + offset += len; + } + + if (offset != 8) + fprintf(stderr, "failed to notify io thread\n"); +} + +/* Used to break IO thread out of select */ +static void io_thread_wakeup(void *opaque) +{ + int fd = (unsigned long)opaque; + char buffer[8]; + size_t offset = 0; + + while (offset < 8) { + ssize_t len; + + len = read(fd, buffer + offset, 8 - offset); + if (len == -1 && errno == EINTR) + continue; + + if (len <= 0) + break; + + offset += len; + } } /* @@ -452,8 +497,20 @@ void qemu_kvm_notify_work(void) 
int kvm_main_loop(void) { + int fds[2]; + io_thread = pthread_self(); qemu_system_ready = 1; + + if (kvm_eventfd(fds) == -1) { + fprintf(stderr, "failed to create eventfd\n"); + return -errno; + } + + qemu_set_fd_handler2(fds[0], NULL, io_thread_wakeup, NULL, + (void *)(unsigned long)fds[0]); + + io_thread_fd = fds[1]; pthread_mutex_unlock(&qemu_mutex); pthread_cond_broadcast(&qemu_system_cond); diff --git a/qemu/qemu-kvm.h b/qemu/qemu-kvm.h index 024a653..8cd63e6 100644 --- a/qemu/qemu-kvm.h +++ b/qemu/qemu-kvm.h @@ -97,4 +97,6 @@ extern kvm_context_t kvm_context; #define qemu_kvm_pit_in_kernel() (0) #endif +int kvm_eventfd(int *fds); + #endif |
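The batching benefit mentioned above can be demonstrated in isolation. The following standalone program (my example, not part of the patch; it assumes a host kernel that implements the eventfd syscall) shows several notifications collapsing into one readable event whose value is their sum:

#include <stdio.h>
#include <stdint.h>
#include <unistd.h>
#include <sys/syscall.h>

int main(void)
{
	uint64_t val = 1, sum = 0;
	int fd = syscall(SYS_eventfd, 0);

	if (fd < 0)
		return 1;	/* no eventfd; qemu would fall back to pipe() */

	/* Three separate notifications... */
	write(fd, &val, sizeof(val));
	write(fd, &val, sizeof(val));
	write(fd, &val, sizeof(val));

	/* ...are serviced by a single read: sum == 3 and the counter resets. */
	read(fd, &sum, sizeof(sum));
	printf("batched notifications: %llu\n", (unsigned long long)sum);
	return 0;
}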
From: Anthony L. <ali...@us...> - 2008-05-05 13:48:20
|
The select() in the IO thread may wait a long time before rebuilding the fd set. Whenever we do something that changes the fd set, we should interrupt the IO thread. Signed-off-by: Anthony Liguori <ali...@us...> diff --git a/qemu/vl.c b/qemu/vl.c index 1192759..e9f0ca4 100644 --- a/qemu/vl.c +++ b/qemu/vl.c @@ -260,6 +260,16 @@ static int event_pending = 1; #define TFR(expr) do { if ((expr) != -1) break; } while (errno == EINTR) +/* KVM runs the main loop in a separate thread. If we update one of the lists + * that are polled before or after select(), we need to make sure to break out + * of the select() to ensure the new item is serviced. + */ +static void main_loop_break(void) +{ + if (kvm_enabled()) + qemu_kvm_notify_work(); +} + void decorate_application_name(char *appname, int max_len) { if (kvm_enabled()) @@ -5680,6 +5690,7 @@ int qemu_set_fd_handler2(int fd, ioh->opaque = opaque; ioh->deleted = 0; } + main_loop_break(); return 0; } @@ -7606,8 +7617,7 @@ void qemu_bh_schedule(QEMUBH *bh) if (env) { cpu_interrupt(env, CPU_INTERRUPT_EXIT); } - if (kvm_enabled()) - qemu_kvm_notify_work(); + main_loop_break(); } void qemu_bh_cancel(QEMUBH *bh) |
From: Anthony L. <ali...@us...> - 2008-05-05 13:47:44
|
This patch reworks the IO thread to use signalfd() instead of sigtimedwait(). This will eliminate the need to use SIGIO everywhere. In this version of the patch, we use signalfd() when it's available. When it isn't available, we create a separate thread and use sigwaitinfo() to simulate signalfd(). I've tested Windows and Linux guests with SMP without seeing any obvious regressions. Signed-off-by: Anthony Liguori <ali...@us...> diff --git a/qemu/kvm-compatfd.c b/qemu/kvm-compatfd.c index 1b030ba..3c2be28 100644 --- a/qemu/kvm-compatfd.c +++ b/qemu/kvm-compatfd.c @@ -15,6 +15,100 @@ #include "qemu-kvm.h" #include <sys/syscall.h> +#include <pthread.h> + +struct sigfd_compat_info +{ + sigset_t mask; + int fd; +}; + +static void *sigwait_compat(void *opaque) +{ + struct sigfd_compat_info *info = opaque; + int err; + + sigprocmask(SIG_BLOCK, &info->mask, NULL); + + do { + siginfo_t siginfo; + + kvm_sleep_begin(); + err = sigwaitinfo(&info->mask, &siginfo); + kvm_sleep_end(); + + if (err == -1 && errno == EINTR) + continue; + + if (err > 0) { + char buffer[128]; + size_t offset = 0; + + memcpy(buffer, &err, sizeof(err)); + while (offset < sizeof(buffer)) { + ssize_t len; + + len = write(info->fd, buffer + offset, + sizeof(buffer) - offset); + if (len == -1 && errno == EINTR) + continue; + + if (len <= 0) { + err = -1; + break; + } + + offset += len; + } + } + } while (err >= 0); + + return NULL; +} + +static int kvm_signalfd_compat(const sigset_t *mask) +{ + pthread_attr_t attr; + pthread_t tid; + struct sigfd_compat_info *info; + int fds[2]; + + info = malloc(sizeof(*info)); + if (info == NULL) { + errno = ENOMEM; + return -1; + } + + if (pipe(fds) == -1) { + free(info); + return -1; + } + + memcpy(&info->mask, mask, sizeof(*mask)); + info->fd = fds[1]; + + pthread_attr_init(&attr); + pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED); + + pthread_create(&tid, &attr, sigwait_compat, info); + + pthread_attr_destroy(&attr); + + return fds[0]; +} + +int kvm_signalfd(const sigset_t *mask) +{ +#if defined(SYS_signalfd) + int ret; + + ret = syscall(SYS_signalfd, -1, mask, _NSIG / 8); + if (!(ret == -1 && errno == ENOSYS)) + return ret; +#endif + + return kvm_signalfd_compat(mask); +} int kvm_eventfd(int *fds) { diff --git a/qemu/qemu-kvm.c b/qemu/qemu-kvm.c index 7134e56..0ea03f8 100644 --- a/qemu/qemu-kvm.c +++ b/qemu/qemu-kvm.c @@ -12,6 +12,9 @@ int kvm_allowed = 1; int kvm_irqchip = 1; int kvm_pit = 1; +#include "qemu-common.h" +#include "console.h" + #include <string.h> #include "hw/hw.h" #include "sysemu.h" @@ -40,14 +43,6 @@ __thread struct vcpu_info *vcpu; static int qemu_system_ready; -struct qemu_kvm_signal_table { - sigset_t sigset; - sigset_t negsigset; -}; - -static struct qemu_kvm_signal_table io_signal_table; -static struct qemu_kvm_signal_table vcpu_signal_table; - #define SIG_IPI (SIGRTMIN+4) struct vcpu_info { @@ -172,37 +167,23 @@ static int has_work(CPUState *env) return kvm_arch_has_work(env); } -static int kvm_process_signal(int si_signo) -{ - struct sigaction sa; - - switch (si_signo) { - case SIGUSR2: - pthread_cond_signal(&qemu_aio_cond); - break; - case SIGALRM: - case SIGIO: - sigaction(si_signo, NULL, &sa); - sa.sa_handler(si_signo); - break; - } - - return 1; -} - -static int kvm_eat_signal(struct qemu_kvm_signal_table *waitset, CPUState *env, - int timeout) +static int kvm_eat_signal(CPUState *env, int timeout) { struct timespec ts; int r, e, ret = 0; siginfo_t siginfo; + sigset_t waitset; ts.tv_sec = timeout / 1000; ts.tv_nsec = (timeout % 1000) * 1000000; - r = 
sigtimedwait(&waitset->sigset, &siginfo, &ts); + sigemptyset(&waitset); + sigaddset(&waitset, SIG_IPI); + + r = sigtimedwait(&waitset, &siginfo, &ts); if (r == -1 && (errno == EAGAIN || errno == EINTR) && !timeout) return 0; e = errno; + pthread_mutex_lock(&qemu_mutex); if (env && vcpu) cpu_single_env = vcpu->env; @@ -211,7 +192,7 @@ static int kvm_eat_signal(struct qemu_kvm_signal_table *waitset, CPUState *env, exit(1); } if (r != -1) - ret = kvm_process_signal(siginfo.si_signo); + ret = 1; if (env && vcpu_info[env->cpu_index].stop) { vcpu_info[env->cpu_index].stop = 0; @@ -227,14 +208,13 @@ static int kvm_eat_signal(struct qemu_kvm_signal_table *waitset, CPUState *env, static void kvm_eat_signals(CPUState *env, int timeout) { int r = 0; - struct qemu_kvm_signal_table *waitset = &vcpu_signal_table; - while (kvm_eat_signal(waitset, env, 0)) + while (kvm_eat_signal(env, 0)) r = 1; if (!r && timeout) { - r = kvm_eat_signal(waitset, env, timeout); + r = kvm_eat_signal(env, timeout); if (r) - while (kvm_eat_signal(waitset, env, 0)) + while (kvm_eat_signal(env, 0)) ; } } @@ -267,9 +247,7 @@ static void pause_all_threads(void) pthread_kill(vcpu_info[i].thread, SIG_IPI); } while (!all_threads_paused()) { - pthread_mutex_unlock(&qemu_mutex); - kvm_eat_signal(&io_signal_table, NULL, 1000); - pthread_mutex_lock(&qemu_mutex); + main_loop_wait(1000); cpu_single_env = NULL; } } @@ -310,6 +288,12 @@ static void setup_kernel_sigmask(CPUState *env) { sigset_t set; + sigemptyset(&set); + sigaddset(&set, SIGUSR2); + sigaddset(&set, SIGIO); + sigaddset(&set, SIGALRM); + sigprocmask(SIG_BLOCK, &set, NULL); + sigprocmask(SIG_BLOCK, NULL, &set); sigdelset(&set, SIG_IPI); @@ -346,7 +330,7 @@ static int kvm_main_loop_cpu(CPUState *env) cpu_single_env = env; while (1) { while (!has_work(env)) - kvm_main_loop_wait(env, 10); + kvm_main_loop_wait(env, 1000); if (env->interrupt_request & CPU_INTERRUPT_HARD) env->hflags &= ~HF_HALTED_MASK; if (!kvm_irqchip_in_kernel(kvm_context) && info->sipi_needed) @@ -394,18 +378,6 @@ static void *ap_main_loop(void *_env) return NULL; } -static void qemu_kvm_init_signal_table(struct qemu_kvm_signal_table *sigtab) -{ - sigemptyset(&sigtab->sigset); - sigfillset(&sigtab->negsigset); -} - -static void kvm_add_signal(struct qemu_kvm_signal_table *sigtab, int signum) -{ - sigaddset(&sigtab->sigset, signum); - sigdelset(&sigtab->negsigset, signum); -} - void kvm_init_new_ap(int cpu, CPUState *env) { pthread_create(&vcpu_info[cpu].thread, NULL, ap_main_loop, env); @@ -414,27 +386,12 @@ void kvm_init_new_ap(int cpu, CPUState *env) pthread_cond_wait(&qemu_vcpu_cond, &qemu_mutex); } -static void qemu_kvm_init_signal_tables(void) -{ - qemu_kvm_init_signal_table(&io_signal_table); - qemu_kvm_init_signal_table(&vcpu_signal_table); - - kvm_add_signal(&io_signal_table, SIGIO); - kvm_add_signal(&io_signal_table, SIGALRM); - kvm_add_signal(&io_signal_table, SIGUSR2); - - kvm_add_signal(&vcpu_signal_table, SIG_IPI); - - sigprocmask(SIG_BLOCK, &io_signal_table.sigset, NULL); -} - int kvm_init_ap(void) { #ifdef TARGET_I386 kvm_tpr_opt_setup(); #endif qemu_add_vm_change_state_handler(kvm_vm_state_change_handler, NULL); - qemu_kvm_init_signal_tables(); signal(SIG_IPI, sig_ipi_handler); return 0; @@ -468,6 +425,61 @@ void qemu_kvm_notify_work(void) fprintf(stderr, "failed to notify io thread\n"); } +static int received_signal; + +/* QEMU relies on periodically breaking out of select via EINTR to poll for IO + and timer signals. 
Since we're now using a file descriptor to handle + signals, select() won't be interrupted by a signal. We need to forcefully + break the select() loop when a signal is received hence + kvm_check_received_signal(). */ + +int kvm_check_received_signal(void) +{ + if (received_signal) { + received_signal = 0; + return 1; + } + + return 0; +} + +/* If we have signalfd, we mask out the signals we want to handle and then + * use signalfd to listen for them. We rely on whatever the current signal + * handler is to dispatch the signals when we receive them. + */ + +static void sigfd_handler(void *opaque) +{ + int fd = (unsigned long)opaque; + struct signalfd_siginfo info; + struct sigaction action; + ssize_t len; + + while (1) { + do { + len = read(fd, &info, sizeof(info)); + } while (len == -1 && errno == EINTR); + + if (len == -1 && errno == EAGAIN) + break; + + if (len != sizeof(info)) { + printf("read from sigfd returned %ld: %m\n", len); + return; + } + + sigaction(info.ssi_signo, NULL, &action); + if (action.sa_handler) + action.sa_handler(info.ssi_signo); + + if (info.ssi_signo == SIGUSR2) { + pthread_cond_signal(&qemu_aio_cond); + } + } + + received_signal = 1; +} + /* Used to break IO thread out of select */ static void io_thread_wakeup(void *opaque) { @@ -487,17 +499,15 @@ static void io_thread_wakeup(void *opaque) offset += len; } -} -/* - * The IO thread has all signals that inform machine events - * blocked (io_signal_table), so it won't get interrupted - * while processing in main_loop_wait(). - */ + received_signal = 1; +} int kvm_main_loop(void) { int fds[2]; + sigset_t mask; + int sigfd; io_thread = pthread_self(); qemu_system_ready = 1; @@ -511,15 +521,30 @@ int kvm_main_loop(void) (void *)(unsigned long)fds[0]); io_thread_fd = fds[1]; - pthread_mutex_unlock(&qemu_mutex); + + sigemptyset(&mask); + sigaddset(&mask, SIGIO); + sigaddset(&mask, SIGALRM); + sigaddset(&mask, SIGUSR2); + sigprocmask(SIG_BLOCK, &mask, NULL); + + sigfd = kvm_signalfd(&mask); + if (sigfd == -1) { + fprintf(stderr, "failed to create signalfd\n"); + return -errno; + } + + fcntl(sigfd, F_SETFL, O_NONBLOCK); + + qemu_set_fd_handler2(sigfd, NULL, sigfd_handler, NULL, + (void *)(unsigned long)sigfd); pthread_cond_broadcast(&qemu_system_cond); + cpu_single_env = NULL; + while (1) { - kvm_eat_signal(&io_signal_table, NULL, 1000); - pthread_mutex_lock(&qemu_mutex); - cpu_single_env = NULL; - main_loop_wait(0); + main_loop_wait(1000); if (qemu_shutdown_requested()) break; else if (qemu_powerdown_requested()) @@ -528,7 +553,6 @@ int kvm_main_loop(void) pthread_kill(vcpu_info[0].thread, SIG_IPI); qemu_kvm_reset_requested = 1; } - pthread_mutex_unlock(&qemu_mutex); } pause_all_threads(); @@ -891,10 +915,7 @@ void qemu_kvm_aio_wait(void) CPUState *cpu_single = cpu_single_env; if (!cpu_single_env) { - pthread_mutex_unlock(&qemu_mutex); - kvm_eat_signal(&io_signal_table, NULL, 1000); - pthread_mutex_lock(&qemu_mutex); - cpu_single_env = NULL; + main_loop_wait(1000); } else { pthread_cond_wait(&qemu_aio_cond, &qemu_mutex); cpu_single_env = cpu_single; @@ -921,3 +942,14 @@ void kvm_cpu_destroy_phys_mem(target_phys_addr_t start_addr, { kvm_destroy_phys_mem(kvm_context, start_addr, size); } + +void kvm_mutex_unlock(void) +{ + pthread_mutex_unlock(&qemu_mutex); +} + +void kvm_mutex_lock(void) +{ + pthread_mutex_lock(&qemu_mutex); + cpu_single_env = NULL; +} diff --git a/qemu/qemu-kvm.h b/qemu/qemu-kvm.h index 8cd63e6..e1e461a 100644 --- a/qemu/qemu-kvm.h +++ b/qemu/qemu-kvm.h @@ -10,6 +10,8 @@ #include "cpu.h" +#include 
<signal.h> + int kvm_main_loop(void); int kvm_qemu_init(void); int kvm_qemu_create_context(void); @@ -97,6 +99,40 @@ extern kvm_context_t kvm_context; #define qemu_kvm_pit_in_kernel() (0) #endif +void kvm_mutex_unlock(void); +void kvm_mutex_lock(void); + +static inline void kvm_sleep_begin(void) +{ + if (kvm_enabled()) + kvm_mutex_unlock(); +} + +static inline void kvm_sleep_end(void) +{ + if (kvm_enabled()) + kvm_mutex_lock(); +} + +int kvm_check_received_signal(void); + +static inline int kvm_received_signal(void) +{ + if (kvm_enabled()) + return kvm_check_received_signal(); + return 0; +} + +#if !defined(SYS_signalfd) +struct signalfd_siginfo { + uint32_t ssi_signo; + uint8_t pad[124]; +}; +#else +#include <linux/signalfd.h> +#endif + +int kvm_signalfd(const sigset_t *mask); int kvm_eventfd(int *fds); #endif diff --git a/qemu/vl.c b/qemu/vl.c index 74be059..1192759 100644 --- a/qemu/vl.c +++ b/qemu/vl.c @@ -7836,6 +7836,23 @@ void qemu_system_powerdown_request(void) cpu_interrupt(cpu_single_env, CPU_INTERRUPT_EXIT); } +static int qemu_select(int max_fd, fd_set *rfds, fd_set *wfds, fd_set *xfds, + struct timeval *tv) +{ + int ret; + + /* KVM holds a mutex while QEMU code is running, we need hooks to + release the mutex whenever QEMU code sleeps. */ + + kvm_sleep_begin(); + + ret = select(max_fd, rfds, wfds, xfds, tv); + + kvm_sleep_end(); + + return ret; +} + void main_loop_wait(int timeout) { IOHandlerRecord *ioh; @@ -7907,11 +7924,12 @@ void main_loop_wait(int timeout) } } - tv.tv_sec = 0; #ifdef _WIN32 + tv.tv_sec = 0; tv.tv_usec = 0; #else - tv.tv_usec = timeout * 1000; + tv.tv_sec = timeout / 1000; + tv.tv_usec = (timeout % 1000) * 1000; #endif #if defined(CONFIG_SLIRP) if (slirp_inited) { @@ -7919,7 +7937,7 @@ void main_loop_wait(int timeout) } #endif moreio: - ret = select(nfds + 1, &rfds, &wfds, &xfds, &tv); + ret = qemu_select(nfds + 1, &rfds, &wfds, &xfds, &tv); if (ret > 0) { IOHandlerRecord **pioh; int more = 0; @@ -7948,7 +7966,7 @@ void main_loop_wait(int timeout) } else pioh = &ioh->next; } - if (more) + if (more && !kvm_received_signal()) goto moreio; } #if defined(CONFIG_SLIRP) |
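For anyone unfamiliar with signalfd, the raw syscall pattern that kvm_signalfd() wraps can be exercised standalone. A minimal sketch (my example, not from the patch; it assumes a kernel with signalfd support and the 128-byte signalfd_siginfo record):

#include <signal.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/signalfd.h>

int main(void)
{
	sigset_t mask;
	struct signalfd_siginfo info;
	int fd;

	/* Block the signal first so it queues on the fd rather than
	   invoking an asynchronous handler. */
	sigemptyset(&mask);
	sigaddset(&mask, SIGUSR2);
	sigprocmask(SIG_BLOCK, &mask, NULL);

	fd = syscall(SYS_signalfd, -1, &mask, _NSIG / 8);
	if (fd < 0)
		return 1;	/* kernel too old; kvm_signalfd_compat() covers this */

	kill(getpid(), SIGUSR2);
	if (read(fd, &info, sizeof(info)) == sizeof(info))
		printf("received signal %u synchronously\n", info.ssi_signo);
	return 0;
}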
From: Anthony L. <ali...@us...> - 2008-05-05 13:47:39
|
QEMU is rather aggressive about exhausting the wait period when selecting. This is fine when the wait period is low and when there are significant delays in between selects, as it improves IO throughput. With the IO thread, there is a very small delay between selects and our wait period for select is very large. This patch changes main_loop_wait to only select once before doing the various other things in the main loop. This generally improves responsiveness of things like SDL but also improves individual file descriptor throughput quite dramatically. Signed-off-by: Anthony Liguori <ali...@us...> diff --git a/qemu/qemu-kvm.c b/qemu/qemu-kvm.c index e16b261..6a90e68 100644 --- a/qemu/qemu-kvm.c +++ b/qemu/qemu-kvm.c @@ -423,24 +423,6 @@ void qemu_kvm_notify_work(void) fprintf(stderr, "failed to notify io thread\n"); } -static int received_signal; - -/* QEMU relies on periodically breaking out of select via EINTR to poll for IO - and timer signals. Since we're now using a file descriptor to handle - signals, select() won't be interrupted by a signal. We need to forcefully - break the select() loop when a signal is received hence - kvm_check_received_signal(). */ - -int kvm_check_received_signal(void) -{ - if (received_signal) { - received_signal = 0; - return 1; - } - - return 0; -} - /* If we have signalfd, we mask out the signals we want to handle and then * use signalfd to listen for them. We rely on whatever the current signal * handler is to dispatch the signals when we receive them. @@ -474,8 +456,6 @@ static void sigfd_handler(void *opaque) pthread_cond_signal(&qemu_aio_cond); } } - - received_signal = 1; } /* Used to break IO thread out of select */ @@ -497,8 +477,6 @@ static void io_thread_wakeup(void *opaque) offset += len; } - - received_signal = 1; } int kvm_main_loop(void) diff --git a/qemu/qemu-kvm.h b/qemu/qemu-kvm.h index e1e461a..34aabd2 100644 --- a/qemu/qemu-kvm.h +++ b/qemu/qemu-kvm.h @@ -114,15 +114,6 @@ static inline void kvm_sleep_end(void) kvm_mutex_lock(); } -int kvm_check_received_signal(void); - -static inline int kvm_received_signal(void) -{ - if (kvm_enabled()) - return kvm_check_received_signal(); - return 0; -} - #if !defined(SYS_signalfd) struct signalfd_siginfo { uint32_t ssi_signo; diff --git a/qemu/vl.c b/qemu/vl.c index e9f0ca4..6935a82 100644 --- a/qemu/vl.c +++ b/qemu/vl.c @@ -7946,23 +7946,18 @@ void main_loop_wait(int timeout) slirp_select_fill(&nfds, &rfds, &wfds, &xfds); } #endif - moreio: ret = qemu_select(nfds + 1, &rfds, &wfds, &xfds, &tv); if (ret > 0) { IOHandlerRecord **pioh; - int more = 0; for(ioh = first_io_handler; ioh != NULL; ioh = ioh->next) { if (!ioh->deleted && ioh->fd_read && FD_ISSET(ioh->fd, &rfds)) { ioh->fd_read(ioh->opaque); - if (!ioh->fd_read_poll || ioh->fd_read_poll(ioh->opaque)) - more = 1; - else + if (!(ioh->fd_read_poll && ioh->fd_read_poll(ioh->opaque))) FD_CLR(ioh->fd, &rfds); } if (!ioh->deleted && ioh->fd_write && FD_ISSET(ioh->fd, &wfds)) { ioh->fd_write(ioh->opaque); - more = 1; } } @@ -7976,8 +7971,6 @@ void main_loop_wait(int timeout) } else pioh = &ioh->next; } - if (more && !kvm_received_signal()) - goto moreio; } #if defined(CONFIG_SLIRP) if (slirp_inited) { |
From: Mohammed G. <m.g...@gm...> - 2008-05-05 13:29:19
|
On Mon, May 5, 2008 at 3:57 PM, Anthony Liguori <an...@co...> wrote: > WinXP fails to boot with your patch applied too. FWIW, Ubuntu 8.04 has > a fixed version of gfxboot that doesn't do nasty things with SS on > privileged mode transitions. > WinXP fails with the patch applied too. Ubuntu 7.10 live CD and FreeDOS don't boot but complain about instruction mov 0x11,sreg not being emulated. |
From: Carsten O. <co...@de...> - 2008-05-05 13:21:11
|
msc...@li... wrote: > I've added Heiko's patch to my patchqueue. But since this is > drivers/s390/kvm this should go in over the kvm.git. See patch below. Acked-by: Carsten Otte <co...@de...> |
From: Martin S. <sch...@de...> - 2008-05-05 13:09:45
|
On Mon, 2008-05-05 at 16:00 +0300, Avi Kivity wrote: > Christian Borntraeger wrote: > >> Hmm... this should help: > >> > >> --- > >> drivers/s390/kvm/kvm_virtio.c | 40 > >> > > +++++++++++++++++++++++----------------- > > > >> 1 file changed, 23 insertions(+), 17 deletions(-) > >> > > > > Thanks Heiko. > > I did a short test and it seems to work. > > > > Acked-by: Christian Borntraeger <bor...@de...> > > > > This looks almost identical to Rusty's patch. Who is going to send this (or > > Rustys) patch to Linus? > > > > I can, but tell me which one. Also, the patch (Heiko's) needs a > changelog entry and a signoff. I've added Heiko's patch to my patchqueue. But since this is drivers/s390/kvm this should go in over the kvm.git. See patch below. -- blue skies, Martin. "Reality continues to ruin my life." - Calvin. --- Subject: [PATCH] kvm/s390 compile error From: Heiko Carstens <hei...@de...> Fix kvm compile error: Commit c45a6816c19dee67b8f725e6646d428901a6dc24 (virtio: explicit advertisement of driver features) and commit e976a2b997fc4ad70ccc53acfe62811c4aaec851 (s390: KVM guest: virtio device support, and kvm hypercalls) don't like each other: CC drivers/s390/kvm/kvm_virtio.o drivers/s390/kvm/kvm_virtio.c:224: error: unknown field 'feature' specified in initializer drivers/s390/kvm/kvm_virtio.c:224: warning: initialization from incompatible pointer type make[3]: *** [drivers/s390/kvm/kvm_virtio.o] Error 1 Cc: Adrian Bunk <bu...@ke...> Signed-off-by: Heiko Carstens <hei...@de...> Signed-off-by: Martin Schwidefsky <sch...@de...> --- drivers/s390/kvm/kvm_virtio.c | 40 +++++++++++++++++++++++----------------- 1 file changed, 23 insertions(+), 17 deletions(-) diff -urpN linux-2.6/drivers/s390/kvm/kvm_virtio.c linux-2.6-patched/drivers/s390/kvm/kvm_virtio.c --- linux-2.6/drivers/s390/kvm/kvm_virtio.c 2008-05-05 13:20:45.000000000 +0200 +++ linux-2.6-patched/drivers/s390/kvm/kvm_virtio.c 2008-05-05 13:20:48.000000000 +0200 @@ -78,27 +78,32 @@ static unsigned desc_size(const struct k + desc->config_len; } -/* - * This tests (and acknowleges) a feature bit. - */ -static bool kvm_feature(struct virtio_device *vdev, unsigned fbit) +/* This gets the device's feature bits. */ +static u32 kvm_get_features(struct virtio_device *vdev) { + unsigned int i; + u32 features = 0; struct kvm_device_desc *desc = to_kvmdev(vdev)->desc; - u8 *features; + u8 *in_features = kvm_vq_features(desc); - if (fbit / 8 > desc->feature_len) - return false; + for (i = 0; i < min(desc->feature_len * 8, 32); i++) + if (in_features[i / 8] & (1 << (i % 8))) + features |= (1 << i); + return features; +} - features = kvm_vq_features(desc); - if (!(features[fbit / 8] & (1 << (fbit % 8)))) - return false; +static void kvm_set_features(struct virtio_device *vdev, u32 features) +{ + unsigned int i; + struct kvm_device_desc *desc = to_kvmdev(vdev)->desc; + /* Second half of bitmap is features we accept. */ + u8 *out_features = kvm_vq_features(desc) + desc->feature_len; - /* - * We set the matching bit in the other half of the bitmap to tell the - * Host we want to use this feature. 
- */ - features[desc->feature_len + fbit / 8] |= (1 << (fbit % 8)); - return true; + memset(out_features, 0, desc->feature_len); + for (i = 0; i < min(desc->feature_len * 8, 32); i++) { + if (features & (1 << i)) + out_features[i / 8] |= (1 << (i % 8)); + } } /* @@ -221,7 +226,8 @@ static void kvm_del_vq(struct virtqueue * The config ops structure as defined by virtio config */ static struct virtio_config_ops kvm_vq_configspace_ops = { - .feature = kvm_feature, + .get_features = kvm_get_features, + .set_features = kvm_set_features, .get = kvm_get, .set = kvm_set, .get_status = kvm_get_status, |
From: Avi K. <av...@qu...> - 2008-05-05 13:00:52
|
Christian Borntraeger wrote: >> Hmm... this should help: >> >> --- >> drivers/s390/kvm/kvm_virtio.c | 40 >> > +++++++++++++++++++++++----------------- > >> 1 file changed, 23 insertions(+), 17 deletions(-) >> > > Thanks Heiko. > I did a short test and it seems to work. > > Acked-by: Christian Borntraeger <bor...@de...> > > This looks almost identical to Rusty's patch. Who is going to send this (or > Rustys) patch to Linus? > I can, but tell me which one. Also, the patch (Heiko's) needs a changelog entry and a signoff. -- error compiling committee.c: too many arguments to function |