From: Anthony L. <an...@co...> - 2008-05-07 00:39:48
Avi Kivity wrote:
> Kay, Allen M wrote:
>> Still todo: move vt.d to kvm-intel.ko module.
>
> Not sure it's the right thing to do.  If we get the iommus abstracted
> properly, we can rename vtd.c to dma.c and move it to virt/kvm/.
>
> The code is certainly a lot more about managing memory than anything
> vmx specific.  It's hardly x86 specific, even.

Really, an external interface to KVM that allowed someone to query the
GPA => PA mapping would suffice.  It should not fault in pages that
aren't present, and we should provide notifications for when the mapping
changes for a given region.  Userspace can enforce the requirement that
memory remains present via mlock().

This allows us to implement a PV API for DMA registration without the
IOMMU code having any particular knowledge of it.

Regards,

Anthony Liguori
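The userspace half of the scheme described above reduces to a single
mlock() call over guest RAM.  A minimal sketch (not from the patch
series; "guest_ram" and "ram_size" are hypothetical names):

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

/* Pin a guest RAM region so its pages stay resident, which is the
 * guarantee a device-DMA (IOMMU) mapping needs from userspace. */
static int pin_guest_ram(void *guest_ram, size_t ram_size)
{
        /* mlock() both faults the pages in and keeps them resident. */
        if (mlock(guest_ram, ram_size) < 0) {
                perror("mlock");
                return -1;
        }
        return 0;
}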
From: Anthony L. <an...@co...> - 2008-05-07 00:34:22
Kay, Allen M wrote:
> Intel-iommu driver changes for kvm vt-d support.  Important changes are
> in intel-iommu.c.  The rest of the changes are for moving intel-iommu.h
> and iova.h from drivers/pci directory to include/linux directory.
>
> Signed-off-by: Allen M Kay <all...@in...>
>
> ----
>
>  b/drivers/pci/dmar.c          |    4
>  b/drivers/pci/intel-iommu.c   |   26 ++-
>  b/drivers/pci/iova.c          |    2
>  b/include/linux/intel-iommu.h |  344 ++++++++++++++++++++++++++++++++++++++++++
>  b/include/linux/iova.h        |   52 ++++++
>  drivers/pci/intel-iommu.h     |  344 ------------------------------------
>  drivers/pci/iova.h            |   52 ------
>  7 files changed, 416 insertions(+), 408 deletions(-)
>
> ----
>
> diff --git a/drivers/pci/dmar.c b/drivers/pci/dmar.c
> index f941f60..a58a5b0 100644
> --- a/drivers/pci/dmar.c
> +++ b/drivers/pci/dmar.c
> @@ -26,8 +26,8 @@
>
>  #include <linux/pci.h>
>  #include <linux/dmar.h>
> -#include "iova.h"
> -#include "intel-iommu.h"
> +#include <linux/iova.h>
> +#include <linux/intel-iommu.h>
>
>  #undef PREFIX
>  #define PREFIX "DMAR:"
> diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
> index 4cb949f..bfa888b 100644
> --- a/drivers/pci/intel-iommu.c
> +++ b/drivers/pci/intel-iommu.c
> @@ -31,8 +31,8 @@
>  #include <linux/dmar.h>
>  #include <linux/dma-mapping.h>
>  #include <linux/mempool.h>
> -#include "iova.h"
> -#include "intel-iommu.h"
> +#include <linux/iova.h>
> +#include <linux/intel-iommu.h>
>  #include <asm/proto.h> /* force_iommu in this header in x86-64 */
>  #include <asm/cacheflush.h>
>  #include <asm/gart.h>
> @@ -1056,7 +1056,7 @@ static void free_iommu(struct intel_iommu *iommu)
>  	kfree(iommu);
>  }
>
> -static struct dmar_domain * iommu_alloc_domain(struct intel_iommu *iommu)
> +struct dmar_domain * iommu_alloc_domain(struct intel_iommu *iommu)
>  {
>  	unsigned long num;
>  	unsigned long ndomains;
> @@ -1086,8 +1086,9 @@ static struct dmar_domain * iommu_alloc_domain(struct intel_iommu *iommu)
>
>  	return domain;
>  }
> +EXPORT_SYMBOL_GPL(iommu_alloc_domain);
>
> -static void iommu_free_domain(struct dmar_domain *domain)
> +void iommu_free_domain(struct dmar_domain *domain)
>  {
>  	unsigned long flags;
>
> @@ -1095,6 +1096,7 @@ static void iommu_free_domain(struct dmar_domain *domain)
>  	clear_bit(domain->id, domain->iommu->domain_ids);
>  	spin_unlock_irqrestore(&domain->iommu->lock, flags);
>  }
> +EXPORT_SYMBOL_GPL(iommu_free_domain);
>
>  static struct iova_domain reserved_iova_list;
>  static struct lock_class_key reserved_alloc_key;
> @@ -1160,7 +1162,7 @@ static inline int guestwidth_to_adjustwidth(int gaw)
>  	return agaw;
>  }
>
> -static int domain_init(struct dmar_domain *domain, int guest_width)
> +int domain_init(struct dmar_domain *domain, int guest_width)
>  {

I think it's already been mentioned, but these are pretty terrible names
if you're exporting these symbols.  Linux supports other IOMMUs, so VT-d
should not be hogging the iommu_* namespace.

Regards,

Anthony Liguori
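For illustration, the kind of namespacing being asked for could look
roughly like the header excerpt below.  These exact names and signatures
are only a sketch (mainline later did settle on an intel_iommu_ prefix,
but not necessarily with these declarations):

/* Hypothetical include/linux/intel-iommu.h excerpt: the same exports,
 * prefixed so they cannot collide with other IOMMU drivers. */
struct intel_iommu;
struct dmar_domain;
struct pci_dev;

struct dmar_domain *intel_iommu_alloc_domain(struct intel_iommu *iommu);
void intel_iommu_free_domain(struct dmar_domain *domain);
int intel_iommu_domain_init(struct dmar_domain *domain, int guest_width);
int intel_iommu_domain_context_mapping(struct dmar_domain *domain,
                                       struct pci_dev *pdev);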
From: Anthony L. <an...@co...> - 2008-05-07 00:30:59
Kay, Allen M wrote:
> Kvm kernel changes.
>
> Signed-off-by: Allen M Kay <all...@in...>
>
> ------
>  arch/x86/kvm/Makefile      |    2
>  arch/x86/kvm/vtd.c         |  183 +++++++++++++++++++++++++++++++++++++++++++++
>  arch/x86/kvm/x86.c         |    7 +
>  include/asm-x86/kvm_host.h |    3
>  include/asm-x86/kvm_para.h |    1
>  include/linux/kvm_host.h   |    6 +
>  virt/kvm/kvm_main.c        |    3
>  7 files changed, 204 insertions(+), 1 deletion(-)
>
> ------
>
> diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
> index c97d35c..b1057fb 100644
> --- a/arch/x86/kvm/Makefile
> +++ b/arch/x86/kvm/Makefile
> @@ -12,7 +12,7 @@ EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm
>  kvm-objs := $(common-objs) x86.o mmu.o x86_emulate.o i8259.o irq.o lapic.o \
>  	i8254.o
>  obj-$(CONFIG_KVM) += kvm.o
> -kvm-intel-objs = vmx.o
> +kvm-intel-objs = vmx.o vtd.o
>  obj-$(CONFIG_KVM_INTEL) += kvm-intel.o
>  kvm-amd-objs = svm.o
>  obj-$(CONFIG_KVM_AMD) += kvm-amd.o
> diff --git a/arch/x86/kvm/vtd.c b/arch/x86/kvm/vtd.c
> new file mode 100644
> index 0000000..9a080b5
> --- /dev/null
> +++ b/arch/x86/kvm/vtd.c
> @@ -0,0 +1,183 @@
> +/*
> + * Copyright (c) 2006, Intel Corporation.
> + *
> + * This program is free software; you can redistribute it and/or modify it
> + * under the terms and conditions of the GNU General Public License,
> + * version 2, as published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope it will be useful, but WITHOUT
> + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
> + * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
> + * more details.
> + *
> + * You should have received a copy of the GNU General Public License along with
> + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
> + * Place - Suite 330, Boston, MA 02111-1307 USA.
> + *
> + * Copyright (C) 2006-2008 Intel Corporation
> + * Author: Allen M. Kay <all...@in...>
> + * Author: Weidong Han <wei...@in...>
> + */
> +
> +#include <linux/list.h>
> +#include <linux/kvm_host.h>
> +#include <linux/pci.h>
> +#include <linux/dmar.h>
> +#include <linux/intel-iommu.h>
> +
> +//#define DEBUG
> +
> +#define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
> +
> +struct dmar_drhd_unit * dmar_find_matched_drhd_unit(struct pci_dev *dev);
> +struct dmar_domain * iommu_alloc_domain(struct intel_iommu *iommu);
> +void iommu_free_domain(struct dmar_domain *domain);
> +int domain_init(struct dmar_domain *domain, int guest_width);
> +int domain_context_mapping(struct dmar_domain *d,
> +			   struct pci_dev *pdev);
> +int domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova,
> +			u64 hpa, size_t size, int prot);
> +void detach_domain_for_dev(struct dmar_domain *domain, u8 bus, u8 devfn);
> +struct dmar_domain * find_domain(struct pci_dev *pdev);

These definitely need to be moved to a common header.

> +
> +int kvm_iommu_map_pages(struct kvm *kvm,
> +			gfn_t base_gfn, unsigned long npages)
> +{
> +	unsigned long gpa;
> +	struct page *page;
> +	hpa_t hpa;
> +	int j, write;
> +	struct vm_area_struct *vma;
> +
> +	if (!kvm->arch.domain)
> +		return 1;

In the kernel, we should be using -errno to return error codes.

> +	gpa = base_gfn << PAGE_SHIFT;
> +	page = gfn_to_page(kvm, base_gfn);
> +	hpa = page_to_phys(page);

Please use gfn_to_pfn().  Keep in mind, by using gfn_to_page/gfn_to_pfn,
you take a reference to a page.  You're leaking that reference here.

> +	printk(KERN_DEBUG "kvm_iommu_map_page: gpa = %lx\n", gpa);
> +	printk(KERN_DEBUG "kvm_iommu_map_page: hpa = %llx\n", hpa);
> +	printk(KERN_DEBUG "kvm_iommu_map_page: size = %lx\n",
> +	       npages*PAGE_SIZE);
> +
> +	for (j = 0; j < npages; j++) {
> +		gpa += PAGE_SIZE;
> +		page = gfn_to_page(kvm, gpa >> PAGE_SHIFT);
> +		hpa = page_to_phys(page);

Again, gfn_to_pfn() -- and you're taking a reference that I never see
you releasing.

> +		domain_page_mapping(kvm->arch.domain, gpa, hpa, PAGE_SIZE,
> +				    DMA_PTE_READ | DMA_PTE_WRITE);
> +		vma = find_vma(current->mm, gpa);
> +		if (!vma)
> +			return 1;
> +		write = (vma->vm_flags & VM_WRITE) != 0;
> +		get_user_pages(current, current->mm, gpa,
> +			       PAGE_SIZE, write, 0, NULL, NULL);

I don't quite see what you're doing here.  It looks like you're trying
to pre-fault the page in?  gfn_to_pfn() will do that for you.  You're
taking a bunch of references here that are never getting released.

I think the general approach here is a bit faulty.  I think what we want
to do is mlock() from userspace to ensure all the memory is present for
the guest.  We should combine this with MMU-notifiers such that whenever
the userspace mapping changes, we can reprogram the IOMMU.  In the case
where we don't have MMU-notifiers, we simply hold on to the memory
forever and never program the IOMMU.

The initial mlock() in userspace is somewhat of a nop here, but it's
important with MMU-notifiers because we will no longer be holding a
reference for guest memory.  We have to ensure we don't swap KVM guest
memory while using hardware pass-through, but AFAICT we do not need to
make the memory non-reclaimable.  As long as we reprogram the IOMMU with
a new, valid mapping, everything should be fine.  mlock() really gives
us the right semantics.

Semantically, a PV API that supports DMA window registration simply
mlock()s the DMA regions on behalf of the guest.  No special logic
should be needed.

Regards,

Anthony Liguori
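Putting the review comments together, the mapping loop might end up
looking roughly like the sketch below.  This is only an illustration,
not the code that was eventually merged: it assumes the gfn_to_pfn()
and kvm_release_pfn_clean() helpers from the KVM tree of this era, and
domain_page_mapping() comes from the patch under review.

#include <linux/kvm_host.h>
#include <linux/intel-iommu.h>

int kvm_iommu_map_pages(struct kvm *kvm, gfn_t base_gfn,
                        unsigned long npages)
{
        unsigned long i;
        pfn_t pfn;
        int r;

        if (!kvm->arch.domain)
                return -ENODEV;         /* -errno, not 1 */

        for (i = 0; i < npages; i++) {
                gfn_t gfn = base_gfn + i;

                /* gfn_to_pfn() faults the page in, so no separate
                 * get_user_pages() pre-faulting is needed. */
                pfn = gfn_to_pfn(kvm, gfn);
                r = domain_page_mapping(kvm->arch.domain,
                                        (dma_addr_t)gfn << PAGE_SHIFT,
                                        (u64)pfn << PAGE_SHIFT, PAGE_SIZE,
                                        DMA_PTE_READ | DMA_PTE_WRITE);
                /* Drop the reference taken by gfn_to_pfn(); residency
                 * is guaranteed by userspace (mlock) instead. */
                kvm_release_pfn_clean(pfn);
                if (r)
                        return r;
        }
        return 0;
}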
From: Glauber C. <gc...@re...> - 2008-05-06 23:08:44
Alexander Graf wrote:
> On May 6, 2008, at 6:27 PM, Glauber Costa wrote:
>> qemu recently added support for 3dnow instructions. Because of
>> that, 3dnow will be featured among cpuid bits. But this will
>> break kvm in cpus that don't have those instructions (which includes
>> my laptop). So we fixup our cpuid before exposing it to the guest.
>
> I actually don't see where the problem is here. As far as I read the
> code, the CPUID feature function gets received from the host CPU and
> bitwise ANDed with a bunch of features that are known to work. What's
> wrong with that approach?

The problem is that besides those known-to-work features, there are also
features that qemu puts in unconditionally. Among them, 3DNOW.

> But I'm pretty sure Dao can tell us a lot more about this.

Sure, it would be welcome.
From: Alexander G. <ag...@su...> - 2008-05-06 23:08:43
On May 6, 2008, at 6:27 PM, Glauber Costa wrote:
> qemu recently added support for 3dnow instructions. Because of
> that, 3dnow will be featured among cpuid bits. But this will
> break kvm in cpus that don't have those instructions (which includes
> my laptop). So we fixup our cpuid before exposing it to the guest.

I actually don't see where the problem is here. As far as I read the
code, the CPUID feature function gets received from the host CPU and
bitwise ANDed with a bunch of features that are known to work. What's
wrong with that approach?

But I'm pretty sure Dao can tell us a lot more about this.

Has there been any progress in getting the new CPUID code in? I think I
could review this sometime soon.

Alex
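The core of the fixup being discussed is just clearing guest CPUID bits
the host lacks.  Reduced to its essence (illustrative only; the bit
positions are the architectural ones from CPUID leaf 0x80000001 EDX and
match the patch):

#include <stdint.h>

#define X86_3DNOW     (1u << 31)  /* CPUID 0x80000001, EDX bit 31 */
#define X86_3DNOWEXT  (1u << 30)  /* CPUID 0x80000001, EDX bit 30 */

/* Drop the 3DNow! bits from the guest's feature word whenever the
 * host CPU does not report them -- the same idea as the patch below,
 * stripped of the kvm_cpuid_entry bookkeeping. */
static uint32_t fixup_ext_features(uint32_t guest_edx, uint32_t host_edx)
{
        if (!(host_edx & X86_3DNOW))
                guest_edx &= ~X86_3DNOW;
        if (!(host_edx & X86_3DNOWEXT))
                guest_edx &= ~X86_3DNOWEXT;
        return guest_edx;
}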
From: Kay, A. M <all...@in...> - 2008-05-06 22:01:49
>> +
>> +#define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
>> +
>> +struct dmar_drhd_unit * dmar_find_matched_drhd_unit(struct pci_dev *dev);
>> +struct dmar_domain * iommu_alloc_domain(struct intel_iommu *iommu);
>> +void iommu_free_domain(struct dmar_domain *domain);
>> +int domain_init(struct dmar_domain *domain, int guest_width);
>> +int domain_context_mapping(struct dmar_domain *d,
>> +			   struct pci_dev *pdev);
>> +int domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova,
>> +			u64 hpa, size_t size, int prot);
>> +void detach_domain_for_dev(struct dmar_domain *domain, u8 bus, u8 devfn);
>> +struct dmar_domain * find_domain(struct pci_dev *pdev);
>
> Please move these to a .h file and also prefix appropriate keywords:
>
> domain_context_mapping is confusing, and since it's an intel iommu-only
> thing, use something like
>
> intel_iommu_domain_context_mapping

These functions currently are just direct calls into existing functions
in drivers/pci/intel-iommu.c -- hence the lack of more descriptive names
in the KVM environment.  To get more relevant names in the KVM
environment, we can either create wrappers for these functions or use an
iommu function table.

Allen
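Allen's second option, an iommu function table, might look roughly like
the sketch below.  The struct and its field names are hypothetical;
nothing like this existed in the tree at the time:

/* Hypothetical function table: KVM would call through these ops rather
 * than linking directly against intel-iommu.c symbols, which would also
 * let a different IOMMU driver (e.g. AMD's) plug in later. */
struct kvm_iommu_ops {
        struct dmar_domain *(*alloc_domain)(struct intel_iommu *iommu);
        void (*free_domain)(struct dmar_domain *domain);
        int (*domain_init)(struct dmar_domain *domain, int guest_width);
        int (*context_mapping)(struct dmar_domain *domain,
                               struct pci_dev *pdev);
        int (*page_mapping)(struct dmar_domain *domain, dma_addr_t iova,
                            u64 hpa, size_t size, int prot);
};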
From: Anthony L. <ali...@us...> - 2008-05-06 19:45:55
Marcelo Tosatti wrote:
> Looks good (the whole series).
>
> Needs some good testing of course... Have you tested migration/loadvm?

No, but I will before resubmitting (which should be sometime tomorrow).

Regards,

Anthony Liguori

> On Mon, May 05, 2008 at 08:47:12AM -0500, Anthony Liguori wrote:
>> It's a little odd to use signals to raise a notification on a file
>> descriptor when we can just work directly with a file descriptor
>> instead.  This patch converts the SIGUSR1-based notification in the
>> io-thread to instead use an eventfd file descriptor.  If eventfd isn't
>> available, we use a pipe() instead.
>>
>> The benefit of using eventfd is that multiple notifications will be
>> batched into a single IO event.
>>
>> Signed-off-by: Anthony Liguori <ali...@us...>
From: Marcelo T. <mto...@re...> - 2008-05-06 19:43:12
Looks good (the whole series).

Needs some good testing of course... Have you tested migration/loadvm?

On Mon, May 05, 2008 at 08:47:12AM -0500, Anthony Liguori wrote:
> It's a little odd to use signals to raise a notification on a file
> descriptor when we can just work directly with a file descriptor
> instead.  This patch converts the SIGUSR1-based notification in the
> io-thread to instead use an eventfd file descriptor.  If eventfd isn't
> available, we use a pipe() instead.
>
> The benefit of using eventfd is that multiple notifications will be
> batched into a single IO event.
>
> Signed-off-by: Anthony Liguori <ali...@us...>
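The eventfd-with-pipe-fallback pattern the patch describes can be
sketched as below.  This is an illustration, not qemu's actual helper
(whose name and structure differ); it uses the raw syscall since glibc
wrappers for eventfd were not yet universal at the time:

#include <unistd.h>
#include <sys/syscall.h>

/* Set up the io-thread notification fds: prefer eventfd(2), fall back
 * to a pipe.  fds[0] is the read side polled by the io-thread, fds[1]
 * is the write side used to raise a notification. */
static int notify_fd_init(int fds[2])
{
#ifdef SYS_eventfd
        int fd = syscall(SYS_eventfd, 0);
        if (fd >= 0) {
                int fd2 = dup(fd);
                if (fd2 < 0) {
                        close(fd);
                        return -1;
                }
                /* eventfd is a counter: any number of writes collapses
                 * into a single readable event -- the batching the
                 * patch description mentions. */
                fds[0] = fd;
                fds[1] = fd2;
                return 0;
        }
#endif
        return pipe(fds);
}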
From: Andrea A. <an...@qu...> - 2008-05-06 17:55:04
Hello everyone,

This is to allow GRU code to call __mmu_notifier_register inside the
mmap_sem (write mode is required, as documented in the patch).  It also
removes the requirement to implement ->release, as it's not guaranteed
all users will really need it.

I didn't integrate the search function, as we can sort that out after
2.6.26 is out, and it wasn't entirely obvious it's really needed, as the
driver should be able to track if a mmu notifier is registered in the
container.

diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -29,10 +29,25 @@ struct mmu_notifier_ops {
 	/*
 	 * Called either by mmu_notifier_unregister or when the mm is
 	 * being destroyed by exit_mmap, always before all pages are
-	 * freed. It's mandatory to implement this method. This can
-	 * run concurrently with other mmu notifier methods and it
+	 * freed. This can run concurrently with other mmu notifier
+	 * methods (the ones invoked outside the mm context) and it
 	 * should tear down all secondary mmu mappings and freeze the
-	 * secondary mmu.
+	 * secondary mmu. If this method isn't implemented you've to
+	 * be sure that nothing could possibly write to the pages
+	 * through the secondary mmu by the time the last thread with
+	 * tsk->mm == mm exits.
+	 *
+	 * As side note: the pages freed after ->release returns could
+	 * be immediately reallocated by the gart at an alias physical
+	 * address with a different cache model, so if ->release isn't
+	 * implemented because all _software_ driven memory accesses
+	 * through the secondary mmu are terminated by the time the
+	 * last thread of this mm quits, you've also to be sure that
+	 * speculative _hardware_ operations can't allocate dirty
+	 * cachelines in the cpu that could not be snooped and made
+	 * coherent with the other read and write operations happening
+	 * through the gart alias address, so leading to memory
+	 * corruption.
 	 */
 	void (*release)(struct mmu_notifier *mn,
 			struct mm_struct *mm);
diff --git a/mm/mmap.c b/mm/mmap.c
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2340,13 +2340,20 @@ static inline void __mm_unlock(spinlock_
 /*
  * This operation locks against the VM for all pte/vma/mm related
  * operations that could ever happen on a certain mm. This includes
- * vmtruncate, try_to_unmap, and all page faults. The holder
- * must not hold any mm related lock. A single task can't take more
- * than one mm_lock in a row or it would deadlock.
+ * vmtruncate, try_to_unmap, and all page faults.
  *
- * The mmap_sem must be taken in write mode to block all operations
- * that could modify pagetables and free pages without altering the
- * vma layout (for example populate_range() with nonlinear vmas).
+ * The caller must take the mmap_sem in read or write mode before
+ * calling mm_lock(). The caller isn't allowed to release the mmap_sem
+ * until mm_unlock() returns.
+ *
+ * While mm_lock() itself won't strictly require the mmap_sem in write
+ * mode to be safe, in order to block all operations that could modify
+ * pagetables and free pages without need of altering the vma layout
+ * (for example populate_range() with nonlinear vmas) the mmap_sem
+ * must be taken in write mode by the caller.
+ *
+ * A single task can't take more than one mm_lock in a row or it would
+ * deadlock.
  *
  * The sorting is needed to avoid lock inversion deadlocks if two
  * tasks run mm_lock at the same time on different mm that happen to
@@ -2377,17 +2384,13 @@ int mm_lock(struct mm_struct *mm, struct
 {
 	spinlock_t **anon_vma_locks, **i_mmap_locks;

-	down_write(&mm->mmap_sem);
 	if (mm->map_count) {
 		anon_vma_locks = vmalloc(sizeof(spinlock_t *) * mm->map_count);
-		if (unlikely(!anon_vma_locks)) {
-			up_write(&mm->mmap_sem);
+		if (unlikely(!anon_vma_locks))
 			return -ENOMEM;
-		}

 		i_mmap_locks = vmalloc(sizeof(spinlock_t *) * mm->map_count);
 		if (unlikely(!i_mmap_locks)) {
-			up_write(&mm->mmap_sem);
 			vfree(anon_vma_locks);
 			return -ENOMEM;
 		}
@@ -2426,10 +2429,12 @@ static void mm_unlock_vfree(spinlock_t *
 /*
  * mm_unlock doesn't require any memory allocation and it won't fail.
  *
+ * The mmap_sem cannot be released until mm_unlock returns.
+ *
  * All memory has been previously allocated by mm_lock and it'll be
  * all freed before returning. Only after mm_unlock returns, the
  * caller is allowed to free and forget the mm_lock_data structure.
- * 
+ *
  * mm_unlock runs in O(N) where N is the max number of VMAs in the
  * mm. The max number of vmas is defined in
  * /proc/sys/vm/max_map_count.
@@ -2444,5 +2449,4 @@ void mm_unlock(struct mm_struct *mm, str
 		mm_unlock_vfree(data->i_mmap_locks,
 				data->nr_i_mmap_locks);
 	}
-	up_write(&mm->mmap_sem);
 }
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -59,7 +59,8 @@ void __mmu_notifier_release(struct mm_st
 		 * from establishing any more sptes before all the
 		 * pages in the mm are freed.
 		 */
-		mn->ops->release(mn, mm);
+		if (mn->ops->release)
+			mn->ops->release(mn, mm);
 		srcu_read_unlock(&mm->mmu_notifier_mm->srcu, srcu);
 		spin_lock(&mm->mmu_notifier_mm->lock);
 	}
@@ -144,20 +145,9 @@ void __mmu_notifier_invalidate_range_end
 	srcu_read_unlock(&mm->mmu_notifier_mm->srcu, srcu);
 }

-/*
- * Must not hold mmap_sem nor any other VM related lock when calling
- * this registration function. Must also ensure mm_users can't go down
- * to zero while this runs to avoid races with mmu_notifier_release,
- * so mm has to be current->mm or the mm should be pinned safely such
- * as with get_task_mm(). If the mm is not current->mm, the mm_users
- * pin should be released by calling mmput after mmu_notifier_register
- * returns. mmu_notifier_unregister must be always called to
- * unregister the notifier. mm_count is automatically pinned to allow
- * mmu_notifier_unregister to safely run at any time later, before or
- * after exit_mmap. ->release will always be called before exit_mmap
- * frees the pages.
- */
-int mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm)
+static int do_mmu_notifier_register(struct mmu_notifier *mn,
+				    struct mm_struct *mm,
+				    int take_mmap_sem)
 {
 	struct mm_lock_data data;
 	struct mmu_notifier_mm * mmu_notifier_mm;
@@ -174,6 +164,8 @@ int mmu_notifier_register(struct mmu_not
 	if (unlikely(ret))
 		goto out_kfree;

+	if (take_mmap_sem)
+		down_write(&mm->mmap_sem);
 	ret = mm_lock(mm, &data);
 	if (unlikely(ret))
 		goto out_cleanup;
@@ -200,6 +192,8 @@ int mmu_notifier_register(struct mmu_not
 	mm_unlock(mm, &data);

 out_cleanup:
+	if (take_mmap_sem)
+		up_write(&mm->mmap_sem);
 	if (mmu_notifier_mm)
 		cleanup_srcu_struct(&mmu_notifier_mm->srcu);
 out_kfree:
@@ -209,7 +203,35 @@ out:
 	BUG_ON(atomic_read(&mm->mm_users) <= 0);
 	return ret;
 }
+
+/*
+ * Must not hold mmap_sem nor any other VM related lock when calling
+ * this registration function. Must also ensure mm_users can't go down
+ * to zero while this runs to avoid races with mmu_notifier_release,
+ * so mm has to be current->mm or the mm should be pinned safely such
+ * as with get_task_mm(). If the mm is not current->mm, the mm_users
+ * pin should be released by calling mmput after mmu_notifier_register
+ * returns. mmu_notifier_unregister must be always called to
+ * unregister the notifier. mm_count is automatically pinned to allow
+ * mmu_notifier_unregister to safely run at any time later, before or
+ * after exit_mmap. ->release will always be called before exit_mmap
+ * frees the pages.
+ */
+int mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm)
+{
+	return do_mmu_notifier_register(mn, mm, 1);
+}
 EXPORT_SYMBOL_GPL(mmu_notifier_register);
+
+/*
+ * Same as mmu_notifier_register but here the caller must hold the
+ * mmap_sem in write mode.
+ */
+int __mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm)
+{
+	return do_mmu_notifier_register(mn, mm, 0);
+}
+EXPORT_SYMBOL_GPL(__mmu_notifier_register);

 /* this is called after the last mmu_notifier_unregister() returned */
 void __mmu_notifier_mm_destroy(struct mm_struct *mm)
@@ -251,7 +273,8 @@ void mmu_notifier_unregister(struct mmu_
 		 * guarantee ->release is called before freeing the
 		 * pages.
 		 */
-		mn->ops->release(mn, mm);
+		if (mn->ops->release)
+			mn->ops->release(mn, mm);
 		srcu_read_unlock(&mm->mmu_notifier_mm->srcu, srcu);
 	} else
 		spin_unlock(&mm->mmu_notifier_mm->lock);
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -148,6 +148,8 @@ static inline int mm_has_notifiers(struc

 extern int mmu_notifier_register(struct mmu_notifier *mn,
 				 struct mm_struct *mm);
+extern int __mmu_notifier_register(struct mmu_notifier *mn,
+				   struct mm_struct *mm);
 extern void mmu_notifier_unregister(struct mmu_notifier *mn,
 				    struct mm_struct *mm);
 extern void __mmu_notifier_mm_destroy(struct mm_struct *mm);
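Based on the comments in the patch itself, a driver would use the new
entry point roughly as follows (the gru_* names are hypothetical, for
illustration only):

#include <linux/mm.h>
#include <linux/mmu_notifier.h>

/* Sketch of the calling convention the patch documents: the plain
 * mmu_notifier_register() takes mmap_sem itself, while the new
 * __mmu_notifier_register() must be called with mmap_sem already held
 * for write -- which is what lets a driver register from inside its
 * own mmap()/ioctl paths. */
static int gru_register_notifier(struct mm_struct *mm,
                                 struct mmu_notifier *gru_mn)
{
        int ret;

        down_write(&mm->mmap_sem);
        ret = __mmu_notifier_register(gru_mn, mm);
        up_write(&mm->mmap_sem);
        return ret;
}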
From: Mohammed G. <m.g...@gm...> - 2008-05-06 17:05:41
On Tue, May 6, 2008 at 5:30 PM, Anthony Liguori <an...@co...> wrote:
> Guillaume Thouvenin wrote:
> > On Mon, 5 May 2008 16:29:21 +0300
> > "Mohammed Gamal" <m.g...@gm...> wrote:
> > > On Mon, May 5, 2008 at 3:57 PM, Anthony Liguori <an...@co...> wrote:
> > > > WinXP fails to boot with your patch applied too. FWIW, Ubuntu 8.04 has
> > > > a fixed version of gfxboot that doesn't do nasty things with SS on
> > > > privileged mode transitions.
> > >
> > > WinXP fails with the patch applied too. Ubuntu 7.10 live CD and
> > > FreeDOS don't boot but complain about instruction mov 0x11,sreg not
> > > being emulated.
> >
> > Can you try with this one please?
> > On my computer it boots ubuntu-8.04-desktop-i386.iso liveCD and also
> > openSUSE-10.3-GM-x86_64-mini.iso
>
> 8.04 is not a good test-case. 7.10 is what you want to try.
>
> The good news is, 7.10 appears to work! The bad news is that about 20% of
> the time, it crashes and displays the following:
>
> kvm_run: failed entry, reason 5
> kvm_run returned -8
>
> So something appears to be a bit buggy. Still, very good work!
>
> Regards,
>
> Anthony Liguori

The 7.10 liveCD doesn't work for me at all. It only works with -no-kvm.
From: Gerd H. <kr...@re...> - 2008-05-06 16:44:56
Gerd Hoffmann wrote:
> Marcelo Tosatti wrote:
>> F8 host, recent kvm-userspace.git (so with IO thread), recent kvm.git
>> (plus your patches), haven't tried 2x but I think 4x is not necessary to
>> reproduce the problem.
>
> Ok, see it too. Seems to be actually two (maybe related) problems.
>
> First the guest hangs hard after a while, burning 100% CPU time
> (deadlocked I guess), doesn't respond to sysrq any more. Is there some
> easy way to get the guest vcpu state then?

Hmm, "info registers" in the qemu monitor hangs ...

cheers,
  Gerd

--
http://kraxel.fedorapeople.org/xenner/
From: Marcelo T. <mto...@re...> - 2008-05-06 16:41:38
Slots 9 and 25 were using the identifier of the previous slot.

Signed-off-by: Marcelo Tosatti <mto...@re...>

diff --git a/bios/acpi-dsdt.dsl b/bios/acpi-dsdt.dsl
index d2e33f4..c145c4b 100755
--- a/bios/acpi-dsdt.dsl
+++ b/bios/acpi-dsdt.dsl
@@ -269,10 +269,10 @@ DefinitionBlock (
             Package() {0x0008ffff, 3, LNKC, 0},

             // PCI Slot 9
-            Package() {0x0008ffff, 0, LNKA, 0},
-            Package() {0x0008ffff, 1, LNKB, 0},
-            Package() {0x0008ffff, 2, LNKC, 0},
-            Package() {0x0008ffff, 3, LNKD, 0},
+            Package() {0x0009ffff, 0, LNKA, 0},
+            Package() {0x0009ffff, 1, LNKB, 0},
+            Package() {0x0009ffff, 2, LNKC, 0},
+            Package() {0x0009ffff, 3, LNKD, 0},

             // PCI Slot 10
             Package() {0x000affff, 0, LNKB, 0},
@@ -365,10 +365,10 @@ DefinitionBlock (
             Package() {0x0018ffff, 3, LNKC, 0},

             // PCI Slot 25
-            Package() {0x0018ffff, 0, LNKA, 0},
-            Package() {0x0018ffff, 1, LNKB, 0},
-            Package() {0x0018ffff, 2, LNKC, 0},
-            Package() {0x0018ffff, 3, LNKD, 0},
+            Package() {0x0019ffff, 0, LNKA, 0},
+            Package() {0x0019ffff, 1, LNKB, 0},
+            Package() {0x0019ffff, 2, LNKC, 0},
+            Package() {0x0019ffff, 3, LNKD, 0},

             // PCI Slot 26
             Package() {0x001affff, 0, LNKB, 0},
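For readers unfamiliar with _PRT entries: the first dword uses the same
packing as ACPI _ADR for PCI, so the fix above is just correcting the
device number in the high word.  A small illustrative helper (not part
of the patch):

/* The _PRT "Address" dword packs (device << 16) | function, where a
 * function field of 0xffff means "any function".  Slot 9's entries
 * must therefore read 0x0009ffff; the bug was that they repeated
 * slot 8's 0x0008ffff. */
static unsigned int prt_address(unsigned int dev)
{
        return (dev << 16) | 0xffff;
}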
From: Glauber C. <gc...@re...> - 2008-05-06 16:33:45
qemu recently added support for 3dnow instructions. Because of
that, 3dnow will be featured among cpuid bits. But this will
break kvm in cpus that don't have those instructions (which includes
my laptop). So we fixup our cpuid before exposing it to the guest.

Signed-off-by: Glauber Costa <gc...@re...>
---
 arch/x86/kvm/x86.c           |   22 ++++++++++++++++++----
 include/asm-x86/cpufeature.h |    2 ++
 2 files changed, 20 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 979f983..e79fcd5 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -919,7 +919,7 @@ static int is_efer_nx(void)
 	return efer & EFER_NX;
 }

-static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
+static void cpuid_fix_caps(struct kvm_vcpu *vcpu)
 {
 	int i;
 	struct kvm_cpuid_entry2 *e, *entry;
@@ -932,6 +932,20 @@ static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
 			break;
 		}
 	}
+
+	/* 3DNOWEXT */
+	if (entry && (entry->edx & (1 << 30)) && !cpu_has_3dnowext) {
+		entry->edx &= ~(1 << 30);
+		printk(KERN_INFO "kvm: guest 3DNOWEXT capability removed\n");
+	}
+
+	/* 3DNOW */
+	if (entry && (entry->edx & (1 << 31)) && !cpu_has_3dnow) {
+		entry->edx &= ~(1 << 31);
+		printk(KERN_INFO "kvm: guest 3DNOW capability removed\n");
+	}
+
+	/* NX */
 	if (entry && (entry->edx & (1 << 20)) && !is_efer_nx()) {
 		entry->edx &= ~(1 << 20);
 		printk(KERN_INFO "kvm: guest NX capability removed\n");
@@ -970,7 +984,7 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
 		vcpu->arch.cpuid_entries[i].padding[2] = 0;
 	}
 	vcpu->arch.cpuid_nent = cpuid->nent;
-	cpuid_fix_nx_cap(vcpu);
+	cpuid_fix_caps(vcpu);
 	r = 0;

 out_free:
@@ -1061,8 +1075,8 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 		bit(X86_FEATURE_LM) |
 #endif
 		bit(X86_FEATURE_MMXEXT) |
-		bit(X86_FEATURE_3DNOWEXT) |
-		bit(X86_FEATURE_3DNOW);
+		(bit(X86_FEATURE_3DNOWEXT) && cpu_has_3dnowext) |
+		(bit(X86_FEATURE_3DNOW) && cpu_has_3dnow);
 	const u32 kvm_supported_word3_x86_features =
 		bit(X86_FEATURE_XMM3) | bit(X86_FEATURE_CX16);
 	const u32 kvm_supported_word6_x86_features =
diff --git a/include/asm-x86/cpufeature.h b/include/asm-x86/cpufeature.h
index 0d609c8..efbc5ce 100644
--- a/include/asm-x86/cpufeature.h
+++ b/include/asm-x86/cpufeature.h
@@ -187,6 +187,8 @@ extern const char * const x86_power_flags[32];
 #define cpu_has_gbpages		boot_cpu_has(X86_FEATURE_GBPAGES)
 #define cpu_has_arch_perfmon	boot_cpu_has(X86_FEATURE_ARCH_PERFMON)
 #define cpu_has_pat		boot_cpu_has(X86_FEATURE_PAT)
+#define cpu_has_3dnow		boot_cpu_has(X86_FEATURE_3DNOW)
+#define cpu_has_3dnowext	boot_cpu_has(X86_FEATURE_3DNOWEXT)

 #if defined(CONFIG_X86_INVLPG) || defined(CONFIG_X86_64)
 # define cpu_has_invlpg		1
--
1.5.0.6
From: Marcelo T. <mto...@re...> - 2008-05-06 16:31:58
Hi Karl,

On Mon, May 05, 2008 at 08:40:22PM -0500, Karl Rister wrote:
> On Thursday 01 May 2008 7:16:53 pm Marcelo Tosatti wrote:
>> Does -no-kvm-irqchip or -no-kvm-pit makes a difference? If not, please
>> grab kvm_stat --once output when that happens.
>
> Per some suggestions I have moved up to kvm-68, which is better, but still
> having problems. Replicating the problem with only one guest spinning has
> proven quite difficult, but attempting to boot a large smp guest can reliably
> recreate the problem. Using -no-kvm-pit did not help the large guest,
> and -no-kvm-irqchip made it seize up even earlier, with only 1 cpu spinning
> instead of all of them.
>
>> Also run "readprofile -r ; readprofile -m System-map-of-guest.map" with the
>> host booted with "profile=kvm". Make sure all guests are running the same
>> kernel image.
>
> I got this from a spinning 16-way guest with only 8 of the host CPUs online
> and without either -no-kvm-irqchip or -no-kvm-pit:
>
> [root@newcastle ~]# readprofile -r ; readprofile -m karl/System.map-2.6.25-03591-g873c05f
>    101 native_read_tsc            3.4828
>      1 read_persistent_clock      0.0192
>     25 kvm_clock_read             0.2660
>     95 getnstimeofday             0.7252
>     13 update_wall_time           0.0138
>      1 second_overflow            0.0020
> readprofile: profile address out of range. Wrong map file?

KVM clock has known problems with SMP guests, please disable it for now.

Also disable LOCKDEP on the guest if it has more VCPU's than CPU's
available in the host.
From: Marcelo T. <mto...@re...> - 2008-05-06 16:30:00
Otherwise hlt emulation fails if PIT is not injecting IRQ's.

Signed-off-by: Marcelo Tosatti <mto...@re...>

diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 1646102..07f9ff1 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -216,7 +216,7 @@ int pit_has_pending_timer(struct kvm_vcpu *vcpu)
 {
 	struct kvm_pit *pit = vcpu->kvm->arch.vpit;

-	if (pit && vcpu->vcpu_id == 0)
+	if (pit && vcpu->vcpu_id == 0 && pit->pit_state.inject_pending)
 		return atomic_read(&pit->pit_state.pit_timer.pending);

 	return 0;
From: Gerd H. <kr...@re...> - 2008-05-06 16:00:05
Marcelo Tosatti wrote:
> F8 host, recent kvm-userspace.git (so with IO thread), recent kvm.git
> (plus your patches), haven't tried 2x but I think 4x is not necessary to
> reproduce the problem.

Ok, see it too. Seems to be actually two (maybe related) problems.

First, the guest hangs hard after a while, burning 100% CPU time
(deadlocked I guess), and doesn't respond to sysrq any more. Is there
some easy way to get the guest vcpu state then? EIP for starters,
preferably with a stack trace?

The other one is that one ticks slower than the other. I don't see it
from the start, but after a while it starts happening (unless the guest
deadlocks before ...).

cheers,
  Gerd

--
http://kraxel.fedorapeople.org/xenner/
From: Avi K. <av...@qu...> - 2008-05-06 15:11:06
Jerone Young wrote:
> These patches fell through the cracks.

Unfortunately, the cracks are getting wider. Anyway, applied, thanks.

--
Do not meddle in the internals of kernels, for they are subtle and quick
to panic.
From: Karl R. <km...@us...> - 2008-05-06 15:10:42
On Thursday 01 May 2008 7:16:53 pm Marcelo Tosatti wrote:
> Does -no-kvm-irqchip or -no-kvm-pit makes a difference? If not, please
> grab kvm_stat --once output when that happens.

Per some suggestions I have moved up to kvm-68, which is better, but still
having problems. Replicating the problem with only one guest spinning has
proven quite difficult, but attempting to boot a large smp guest can reliably
recreate the problem. Using -no-kvm-pit did not help the large guest,
and -no-kvm-irqchip made it seize up even earlier, with only 1 cpu spinning
instead of all of them.

> Also run "readprofile -r ; readprofile -m System-map-of-guest.map" with the
> host booted with "profile=kvm". Make sure all guests are running the same
> kernel image.

I got this from a spinning 16-way guest with only 8 of the host CPUs online
and without either -no-kvm-irqchip or -no-kvm-pit:

[root@newcastle ~]# readprofile -r ; readprofile -m karl/System.map-2.6.25-03591-g873c05f
   101 native_read_tsc            3.4828
     1 read_persistent_clock      0.0192
    25 kvm_clock_read             0.2660
    95 getnstimeofday             0.7252
    13 update_wall_time           0.0138
     1 second_overflow            0.0020
readprofile: profile address out of range. Wrong map file?

The kvm_stat output during this is:

[root@newcastle ~]# kvm_stat --once
efer_reload                 23354       0
exits                     3587109    2250
fpu_reload                1934298       0
halt_exits                   4583       0
halt_wakeup                    42       0
host_state_reload         2165502     167
hypercalls                   1482       0
insn_emulation             900199       0
insn_emulation_fail             0       0
invlpg                          0       0
io_exits                  1983116       0
irq_exits                  427728    2250
irq_window                      0       0
largepages                      0       0
mmio_exits                 163522       0
mmu_cache_miss                176       0
mmu_flooded                    99       0
mmu_pde_zapped                191       0
mmu_pte_updated                10       0
mmu_pte_write               59030       0
mmu_recycled                    0       0
mmu_shadow_zapped              99       0
pf_fixed                    14890       0
pf_guest                        0       0
remote_tlb_flush               29       0
request_irq                     0       0
signal_exits                    1       0
tlb_flush                  481952       0

The output with -no-kvm-pit looked almost identical, and with
-no-kvm-irqchip there were no samples registered for either tool.

--
Karl Rister
IBM Linux Performance Team
km...@us...
(512) 838-1553 (t/l 678)
From: Andrea A. <an...@qu...> - 2008-05-06 14:47:20
On Mon, May 05, 2008 at 02:46:25PM -0500, Jack Steiner wrote:
> If a task fails to unmap a GRU segment, they still exist at the start of

Yes, this will also happen in case the well-behaved task receives
SIGKILL, so you can test it that way too.

> exit. On the ->release callout, I set a flag in the container of my
> mmu_notifier that exit has started. As VMA are cleaned up, TLB flushes
> are skipped because of the flag is set. When the GRU VMA is deleted, I free

GRU TLB flushes aren't skipped because your flag is set but because
__mmu_notifier_release already executed
list_del_init_rcu(&grunotifier->hlist) before proceeding with unmap_vmas.

> my structure containing the notifier.

As long as nobody can write through the already established gru tlbs,
and nobody can establish new tlbs after exit_mmap runs, you don't
strictly need ->release.

> I _think_ works. Do you see any problems?

You can remove the flag and ->release and ->clear_flush_young (if you
keep clear_flush_young implemented, it should return 0). The
synchronize_rcu after mmu_notifier_register can also be dropped thanks
to mm_lock().

gru_drop_mmu_notifier should be careful with current->mm if you're using
an fd and if the fd can be passed to a different task through unix
sockets (you should probably fail any operation if current->mm !=
gru->mm).

The way I use ->release in KVM is to set the root hpa to -1UL (invalid)
as a debug trap. That's only for debugging, because even if tlb entries
and sptes are still established on the secondary mmu, they are only
relevant when the cpu jumps to guest mode, and that can never happen
again after exit_mmap is started.

> I should also mention that I have an open-coded function that possibly
> belongs in mmu_notifier.c. A user is allowed to have multiple GRU segments.
> Each GRU has a couple of data structures linked to the VMA. All, however,
> need to share the same notifier. I currently open code a function that
> scans the notifier list to determine if a GRU notifier already exists.
> If it does, I update a refcnt & use it. Otherwise, I register a new
> one. All of this is protected by the mmap_sem.
>
> Just in case I mangled the above description, I'll attach a copy of the
> GRU mmuops code.

Well, that function needs fixing w.r.t. srcu. Are you sure you want to
search for mn->ops == gru_mmuops and not for mn == gmn? And if you
search for mn, why can't you keep track of the mn being registered or
unregistered outside of the mmu_notifier layer? Set a bitflag in the
container after mmu_notifier_register returns and clear it after
_unregister returns. I doubt saving one bitflag is worth searching the
list, and your approach makes it obvious that you have to protect the
bitflag and the register/unregister under write-mmap_sem yourself.
Otherwise the find function will return an object that can be freed at
any time if somebody calls unregister and kfree. (synchronize_srcu in
mmu_notifier_unregister won't wait for anything but some outstanding
srcu_read_lock.)
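Andrea's suggestion, tracking registration with a flag in the driver's
own container instead of searching the notifier list, might look like
this sketch.  The gru_mm_struct layout and field names are hypothetical:

#include <linux/mm.h>
#include <linux/mmu_notifier.h>

/* Hypothetical GRU-side container: one notifier per mm, refcounted,
 * with registration state tracked locally.  All transitions happen
 * under down_write(&mm->mmap_sem), as the reply above requires, so no
 * path can observe a half-registered notifier. */
struct gru_mm_struct {
        struct mmu_notifier mn;
        int registered;         /* protected by mmap_sem (write) */
        int refcnt;             /* ditto */
};

/* Caller already holds mmap_sem for write. */
static int gru_get_notifier(struct mm_struct *mm, struct gru_mm_struct *gms)
{
        int ret = 0;

        if (gms->registered) {
                gms->refcnt++;
        } else {
                ret = __mmu_notifier_register(&gms->mn, mm);
                if (!ret) {
                        gms->registered = 1;
                        gms->refcnt = 1;
                }
        }
        return ret;
}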
From: Avi K. <av...@qu...> - 2008-05-06 14:41:47
Zhang, Xiantao wrote:
> Hi, Avi
> 	This patch should go into RC1, otherwise it will block kvm/ia64
> userspace build.
>
> diff --git a/include/asm-ia64/kvm.h b/include/asm-ia64/kvm.h
> index eb2d355..62b5fad 100644
> --- a/include/asm-ia64/kvm.h
> +++ b/include/asm-ia64/kvm.h
> @@ -22,7 +22,12 @@
>   */
>
>  #include <asm/types.h>
> +
> +#ifdef __KERNEL__
>  #include <asm/fpu.h>
> +#else
> +#include <signal.h>
> +#endif

Fishy. A kernel header including a userspace header? Maybe you need to
include <linux/signal.h> unconditionally?

--
Do not meddle in the internals of kernels, for they are subtle and quick
to panic.
From: Avi K. <av...@qu...> - 2008-05-06 14:39:06
Martin Schwidefsky wrote:
> I've added Heiko's patch to my patchqueue. But since this is
> drivers/s390/kvm this should go in over the kvm.git. See patch below.

Thanks, I added this to my queue as well.

--
Do not meddle in the internals of kernels, for they are subtle and quick
to panic.
From: Avi K. <av...@qu...> - 2008-05-06 14:33:18
Jan Luebbe wrote:
>> 0f 0d 0b                prefetchw (%ebx)
>>
>> This is an AMD 3Dnow! instruction, which is not supported on Intel
>> processors. I guess the 3Dnow! cpuid bit leaked in via the qemu merge.
>>
>> I guess two fixes are needed:
>> - remove the 3Dnow! bit
>> - add emulation for prefetchw (easy, as it doesn't need to do anything)
>>   to support live migration from AMD to Intel
>
> This problem still occurs with kvm-68. Which CPUs will be affected by
> this (is it only the Core Duo)?

All Intels.

> I'm currently delaying the upload of a new kvm package to debian because
> of this.

I've fixed it for kvm-69.

--
Do not meddle in the internals of kernels, for they are subtle and quick
to panic.
From: Anthony L. <an...@co...> - 2008-05-06 14:31:12
Guillaume Thouvenin wrote:
> On Mon, 5 May 2008 16:29:21 +0300
> "Mohammed Gamal" <m.g...@gm...> wrote:
>> On Mon, May 5, 2008 at 3:57 PM, Anthony Liguori <an...@co...> wrote:
>>> WinXP fails to boot with your patch applied too. FWIW, Ubuntu 8.04 has
>>> a fixed version of gfxboot that doesn't do nasty things with SS on
>>> privileged mode transitions.
>>
>> WinXP fails with the patch applied too. Ubuntu 7.10 live CD and
>> FreeDOS don't boot but complain about instruction mov 0x11,sreg not
>> being emulated.
>
> Can you try with this one please?
> On my computer it boots ubuntu-8.04-desktop-i386.iso liveCD and also
> openSUSE-10.3-GM-x86_64-mini.iso

8.04 is not a good test-case. 7.10 is what you want to try.

The good news is, 7.10 appears to work! The bad news is that about 20% of
the time, it crashes and displays the following:

kvm_run: failed entry, reason 5
kvm_run returned -8

So something appears to be a bit buggy. Still, very good work!

Regards,

Anthony Liguori
From: Avi K. <av...@qu...> - 2008-05-06 13:55:38
Avi Kivity wrote:
> [Resurrecting post from the dead]
>
> Marcelo Tosatti wrote:
>> Forcing clustered APIC mode works only on SMP, and there was high CPU
>> consumption on Windows SMP guests due to C3 state being reported (fixed
>> in kvm-30 something).
>>
>> So perhaps:
>> - Faking clustered APIC on SMP
>> - Faking C3 on UP
>>
>> And turning off the TSC bit (for 32-bit guests).
>>
>> Is the way to go?
>> Avi, do you understand why C3 was causing the Windows SMP problems?
>
> It's probably inb()ing on the port in a loop. It's not SMP causing
> the problems, but the ACPI HAL. I'll check this.

Yes, it's reading 0xb010 and 0xb014, which ought to place the cpu in
sleep mode, but don't.

--
Do not meddle in the internals of kernels, for they are subtle and quick
to panic.
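What the Windows ACPI HAL is doing, per the thread, amounts to the
standard ACPI C-state entry: an I/O read of a P_LVLx register requests
the sleep state, and if the hypervisor doesn't actually halt the vcpu on
that read, the idle loop degenerates into a busy spin.  A sketch of the
guest-side access (the 0xb010/0xb014 ports are taken from the message
above; shown in userspace form for illustration only, since a real guest
does this from its kernel and userspace port I/O needs ioperm/iopl):

#include <sys/io.h>

static void acpi_request_sleep(void)
{
        /* Reading the P_LVL3-style register is the C3 entry request;
         * the returned value is irrelevant. */
        (void)inb(0xb014);
}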
From: Guillaume T. <gui...@ex...> - 2008-05-06 13:38:43
On Mon, 5 May 2008 16:29:21 +0300
"Mohammed Gamal" <m.g...@gm...> wrote:

> On Mon, May 5, 2008 at 3:57 PM, Anthony Liguori <an...@co...> wrote:
> > WinXP fails to boot with your patch applied too. FWIW, Ubuntu 8.04 has
> > a fixed version of gfxboot that doesn't do nasty things with SS on
> > privileged mode transitions.
>
> WinXP fails with the patch applied too. Ubuntu 7.10 live CD and
> FreeDOS don't boot but complain about instruction mov 0x11,sreg not
> being emulated.

Can you try with this one please?
On my computer it boots ubuntu-8.04-desktop-i386.iso liveCD and also
openSUSE-10.3-GM-x86_64-mini.iso

I will try FreeDOS and WinXP if I can find one ;)

Regards,
Guillaume

---

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 26c4f02..6e76c2e 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1272,7 +1272,9 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
 	fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->arch.rmode.gs);
 	fix_pmode_dataseg(VCPU_SREG_FS, &vcpu->arch.rmode.fs);

+#if 0
 	vmcs_write16(GUEST_SS_SELECTOR, 0);
+#endif
 	vmcs_write32(GUEST_SS_AR_BYTES, 0x93);

 	vmcs_write16(GUEST_CS_SELECTOR,
@@ -2633,6 +2635,73 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	return 1;
 }

+static int invalid_guest_state(struct kvm_vcpu *vcpu,
+			       struct kvm_run *kvm_run, u32 failure_reason)
+{
+	u16 ss, cs;
+	u8 opcodes[4];
+	unsigned long rip = vcpu->arch.rip;
+	unsigned long rip_linear;
+
+	ss = vmcs_read16(GUEST_SS_SELECTOR);
+	cs = vmcs_read16(GUEST_CS_SELECTOR);
+
+	if ((ss & 0x03) != (cs & 0x03)) {
+		int err;
+		rip_linear = rip + vmx_get_segment_base(vcpu, VCPU_SREG_CS);
+		emulator_read_std(rip_linear, (void *)opcodes, 4, vcpu);
+#if 0
+		printk(KERN_INFO "emulation at (%lx) rip %lx: %02x %02x %02x %02x\n",
+		       rip_linear,
+		       rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
+#endif
+		err = emulate_instruction(vcpu, kvm_run, 0, 0, 0);
+		switch (err) {
+		case EMULATE_DONE:
+#if 0
+			printk(KERN_INFO "successfully emulated instruction\n");
+#endif
+			return 1;
+		case EMULATE_DO_MMIO:
+			printk(KERN_INFO "mmio?\n");
+			return 0;
+		default:
+			kvm_report_emulation_failure(vcpu, "vmentry failure");
+			break;
+		}
+	}
+
+	kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
+	kvm_run->hw.hardware_exit_reason = failure_reason;
+	return 0;
+}
+
+static int handle_vmentry_failure(struct kvm_vcpu *vcpu,
+				  struct kvm_run *kvm_run,
+				  u32 failure_reason)
+{
+	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+#if 0
+	printk(KERN_INFO "Failed vm entry (exit reason 0x%x) ", failure_reason);
+#endif
+	switch (failure_reason) {
+	case EXIT_REASON_INVALID_GUEST_STATE:
+#if 0
+		printk("invalid guest state\n");
+#endif
+		return invalid_guest_state(vcpu, kvm_run, failure_reason);
+	case EXIT_REASON_MSR_LOADING:
+		printk("caused by MSR entry %ld loading.\n", exit_qualification);
+		break;
+	case EXIT_REASON_MACHINE_CHECK:
+		printk("caused by machine check.\n");
+		break;
+	default:
+		printk("reason not known yet!\n");
+		break;
+	}
+	return 0;
+}

 /*
  * The exit handlers return 1 if the exit was handled fully and guest execution
  * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
@@ -2694,6 +2763,12 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 	    exit_reason != EXIT_REASON_EPT_VIOLATION))
 		printk(KERN_WARNING "%s: unexpected, valid vectoring info and "
 		       "exit reason is 0x%x\n", __func__, exit_reason);
+
+	if ((exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) {
+		exit_reason &= ~VMX_EXIT_REASONS_FAILED_VMENTRY;
+		return handle_vmentry_failure(vcpu, kvm_run, exit_reason);
+	}
+
 	if (exit_reason < kvm_vmx_max_exit_handlers &&
 	    kvm_vmx_exit_handlers[exit_reason])
 		return kvm_vmx_exit_handlers[exit_reason](vcpu, kvm_run);
diff --git a/arch/x86/kvm/vmx.h b/arch/x86/kvm/vmx.h
index 79d94c6..2cebf48 100644
--- a/arch/x86/kvm/vmx.h
+++ b/arch/x86/kvm/vmx.h
@@ -238,7 +238,10 @@ enum vmcs_field {
 #define EXIT_REASON_IO_INSTRUCTION      30
 #define EXIT_REASON_MSR_READ            31
 #define EXIT_REASON_MSR_WRITE           32
+#define EXIT_REASON_INVALID_GUEST_STATE 33
+#define EXIT_REASON_MSR_LOADING         34
 #define EXIT_REASON_MWAIT_INSTRUCTION   36
+#define EXIT_REASON_MACHINE_CHECK       41
 #define EXIT_REASON_TPR_BELOW_THRESHOLD 43
 #define EXIT_REASON_APIC_ACCESS         44
 #define EXIT_REASON_EPT_VIOLATION       48
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 979f983..c84c5ec 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3044,8 +3044,8 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 	return 0;
 }

-static void get_segment(struct kvm_vcpu *vcpu,
-			struct kvm_segment *var, int seg)
+void get_segment(struct kvm_vcpu *vcpu,
+		 struct kvm_segment *var, int seg)
 {
 	kvm_x86_ops->get_segment(vcpu, var, seg);
 }
@@ -3128,8 +3128,8 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
 	return 0;
 }

-static void set_segment(struct kvm_vcpu *vcpu,
-			struct kvm_segment *var, int seg)
+void set_segment(struct kvm_vcpu *vcpu,
+		 struct kvm_segment *var, int seg)
 {
 	kvm_x86_ops->set_segment(vcpu, var, seg);
 }
@@ -3287,8 +3287,8 @@ static int load_segment_descriptor_to_kvm_desct(struct kvm_vcpu *vcpu,
 	return 0;
 }

-static int load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
-				   int type_bits, int seg)
+int load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
+			    int type_bits, int seg)
 {
 	struct kvm_segment kvm_seg;
diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index 8a96320..581d18e 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -69,6 +69,7 @@
 #define GroupDual   (1<<15)     /* Alternate decoding of mod == 3 */
 #define GroupMask   0xff        /* Group number stored in bits 0:7 */

+int switch_perso = 0;
 enum {
 	Group1_80, Group1_81, Group1_82, Group1_83,
 	Group1A, Group3_Byte, Group3, Group4, Group5, Group7,
@@ -138,7 +139,8 @@ static u16 opcode_table[256] = {
 	/* 0x88 - 0x8F */
 	ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov,
 	ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
-	0, ModRM | DstReg, 0, Group | Group1A,
+	DstMem | SrcReg | ModRM | Mov, ModRM | DstReg,
+	DstReg | SrcMem | ModRM | Mov, Group | Group1A,
 	/* 0x90 - 0x9F */
 	0, 0, 0, 0, 0, 0, 0, 0,
 	0, 0, 0, 0, ImplicitOps | Stack, ImplicitOps | Stack, 0, 0,
@@ -152,7 +154,8 @@ static u16 opcode_table[256] = {
 	ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String,
 	ByteOp | ImplicitOps | String, ImplicitOps | String,
 	/* 0xB0 - 0xBF */
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0,
+	DstReg | SrcImm | Mov, 0, 0, 0, 0, 0, 0, 0,
 	/* 0xC0 - 0xC7 */
 	ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM,
 	0, ImplicitOps | Stack, 0, 0,
@@ -168,7 +171,7 @@ static u16 opcode_table[256] = {
 	/* 0xE0 - 0xE7 */
 	0, 0, 0, 0, 0, 0, 0, 0,
 	/* 0xE8 - 0xEF */
-	ImplicitOps | Stack, SrcImm|ImplicitOps, 0, SrcImmByte|ImplicitOps,
+	ImplicitOps | Stack, SrcImm | ImplicitOps, ImplicitOps, SrcImmByte | ImplicitOps,
 	0, 0, 0, 0,
 	/* 0xF0 - 0xF7 */
 	0, 0, 0, 0,
@@ -1246,6 +1249,19 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt,
 	default:
 		break;
 	}
+#if 0
+	if (switch_perso) {
+		printk(KERN_INFO " writeback: dst.byte %d\n", c->dst.bytes);
+		printk(KERN_INFO " writeback: dst.ptr 0x%p\n", c->dst.ptr);
+		printk(KERN_INFO " writeback: dst.val 0x%lx\n", c->dst.val);
+		printk(KERN_INFO " writeback: src.ptr 0x%p\n", c->src.ptr);
+		printk(KERN_INFO " writeback: src.val 0x%lx\n", c->src.val);
+		printk(KERN_INFO " writeback: RAX 0x%lx\n", c->regs[VCPU_REGS_RAX]);
+		printk(KERN_INFO " writeback: RSP 0x%lx\n", c->regs[VCPU_REGS_RSP]);
+		printk(KERN_INFO " writeback: CS 0x%lx\n", c->regs[VCPU_SREG_CS]);
+		printk(KERN_INFO " writeback: SS 0x%lx\n", c->regs[VCPU_SREG_SS]);
+	}
+#endif
 	return 0;
 }
@@ -1342,6 +1358,10 @@ special_insn:
 	switch (c->b) {
 	case 0x00 ... 0x05:
 	      add:		/* add */
+		if ((c->d & ModRM) && c->modrm_mod == 3) {
+			c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+			c->dst.ptr = decode_register(c->modrm_rm, c->regs, c->d & ByteOp);
+		}
 		emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags);
 		break;
 	case 0x08 ... 0x0d:
@@ -1514,14 +1534,90 @@ special_insn:
 		break;
 	case 0x88 ... 0x8b:	/* mov */
 		goto mov;
+	case 0x8c: {		/* mov r/m, sreg */
+		struct kvm_segment segreg;
+
+		if (c->modrm_mod == 0x3)
+			c->src.val = c->modrm_val;
+
+		switch (c->modrm_reg) {
+		case 0:
+			get_segment(ctxt->vcpu, &segreg, VCPU_SREG_ES);
+			break;
+		case 1:
+			get_segment(ctxt->vcpu, &segreg, VCPU_SREG_CS);
+			break;
+		case 2:
+			get_segment(ctxt->vcpu, &segreg, VCPU_SREG_SS);
+			break;
+		case 3:
+			get_segment(ctxt->vcpu, &segreg, VCPU_SREG_DS);
+			break;
+		case 4:
+			get_segment(ctxt->vcpu, &segreg, VCPU_SREG_FS);
+			break;
+		case 5:
+			get_segment(ctxt->vcpu, &segreg, VCPU_SREG_GS);
+			break;
+		default:
+			printk(KERN_INFO "0x8c: Invalid segreg in modrm byte 0x%02x\n",
+			       c->modrm);
+			goto cannot_emulate;
+		}
+		c->dst.val = segreg.selector;
+		c->dst.bytes = 2;
+		c->dst.ptr = (unsigned long *)decode_register(c->modrm_rm, c->regs,
+							      c->d & ByteOp);
+		break;
+	}
 	case 0x8d: /* lea r16/r32, m */
 		c->dst.val = c->modrm_ea;
 		break;
+	case 0x8e: {		/* mov seg, r/m16 */
+		uint16_t sel;
+
+		sel = c->src.val;
+		switch (c->modrm_reg) {
+		case 0:
+			if (load_segment_descriptor(ctxt->vcpu, sel, 1, VCPU_SREG_ES) < 0)
+				goto cannot_emulate;
+			break;
+		case 1:
+			if (load_segment_descriptor(ctxt->vcpu, sel, 9, VCPU_SREG_CS) < 0)
+				goto cannot_emulate;
+			break;
+		case 2:
+			if (load_segment_descriptor(ctxt->vcpu, sel, 1, VCPU_SREG_SS) < 0)
+				goto cannot_emulate;
+			break;
+		case 3:
+			if (load_segment_descriptor(ctxt->vcpu, sel, 1, VCPU_SREG_DS) < 0)
+				goto cannot_emulate;
+			break;
+		case 4:
+			if (load_segment_descriptor(ctxt->vcpu, sel, 1, VCPU_SREG_FS) < 0)
+				goto cannot_emulate;
+			break;
+		case 5:
+			if (load_segment_descriptor(ctxt->vcpu, sel, 1, VCPU_SREG_GS) < 0)
+				goto cannot_emulate;
+			break;
+		default:
+			printk(KERN_INFO "Invalid segreg in modrm byte 0x%02x\n",
+			       c->modrm);
+			goto cannot_emulate;
+		}
+
+		c->dst.type = OP_NONE;	/* Disable writeback. */
+		break;
+	}
 	case 0x8f: /* pop (sole member of Grp1a) */
 		rc = emulate_grp1a(ctxt, ops);
 		if (rc != 0)
 			goto done;
 		break;
+	case 0xb8: /* mov r, imm */
+		goto mov;
 	case 0x9c: /* pushf */
 		c->src.val = (unsigned long) ctxt->eflags;
 		emulate_push(ctxt);
@@ -1623,6 +1719,10 @@ special_insn:
 		DPRINTF("Urk! I don't handle SCAS.\n");
 		goto cannot_emulate;
 	case 0xc0 ... 0xc1:
+		if ((c->d & ModRM) && c->modrm_mod == 3) {
+			c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+			c->dst.ptr = decode_register(c->modrm_rm, c->regs, c->d & ByteOp);
+		}
 		emulate_grp2(ctxt);
 		break;
 	case 0xc3: /* ret */
@@ -1660,6 +1760,39 @@ special_insn:
 		break;
 	}
 	case 0xe9: /* jmp rel */
+		jmp_rel(c, c->src.val);
+		c->dst.type = OP_NONE;	/* Disable writeback. */
+		break;
+	case 0xea: /* jmp far */ {
+		uint32_t eip;
+		uint16_t sel;
+
+		/* enable switch_perso */
+		switch_perso = 1;
+
+		switch (c->op_bytes) {
+		case 2:
+			eip = insn_fetch(u16, 2, c->eip);
+			eip = eip & 0x0000FFFF;	/* clear upper 16 bits */
+			break;
+		case 4:
+			eip = insn_fetch(u32, 4, c->eip);
+			break;
+		default:
+			DPRINTF("jmp far: Invalid op_bytes\n");
+			goto cannot_emulate;
+		}
+		sel = insn_fetch(u16, 2, c->eip);
+		if (ctxt->mode == X86EMUL_MODE_REAL)
+			eip |= (sel << 4);
+		else if (load_segment_descriptor(ctxt->vcpu, sel, 9, VCPU_SREG_CS) < 0) {
+			DPRINTF("jmp far: Failed to load CS descriptor\n");
+			goto cannot_emulate;
+		}
+
+		c->eip = eip;
+		break;
+	}
 	case 0xeb: /* jmp rel short */
 		jmp_rel(c, c->src.val);
 		c->dst.type = OP_NONE;	/* Disable writeback. */
diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h
index 1d8cd01..29254b4 100644
--- a/include/asm-x86/kvm_host.h
+++ b/include/asm-x86/kvm_host.h
@@ -495,6 +495,10 @@ int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr,
 int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr,
 		    unsigned long value);

+void set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
+void get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
+int load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
+			    int type_bits, int seg);
 int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason);

 void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);