From: Laurent V. <Lau...@bu...> - 2008-04-23 13:09:14
|
These two patches allow batching writes to MMIO. When the kernel has to send MMIO writes to userspace, it stores them in memory until it has to return control to userspace for another reason. This avoids having too many context switches for operations that can wait. WARNING: this breaks compatibility with the old userspace part. Signed-off-by: Laurent Vivier <Lau...@bu...> [PATCH 1/2] kvm: Batch writes to MMIO - kernel part [PATCH 2/2] kvm-userspace: Batch writes to MMIO - userspace part Signed-off-by: Laurent Vivier <Lau...@bu...> |
From: Laurent V. <Lau...@bu...> - 2008-04-23 13:10:40
|
This patch is the kernel part of the "batch writes to MMIO" patch. When kernel has to send MMIO writes to userspace, it stores them in memory until it has to pass the hand to userspace for another reason. This avoids to have too many context switches on operations that can wait. WARNING: this breaks compatibility with old userspace part. Signed-off-by: Laurent Vivier <Lau...@bu...> --- arch/x86/kvm/x86.c | 21 +++++++++++++++++++++ include/asm-x86/kvm_host.h | 2 ++ include/linux/kvm.h | 10 +++++++++- virt/kvm/kvm_main.c | 3 +++ 4 files changed, 35 insertions(+), 1 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0ce5563..3881056 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2942,8 +2942,21 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) kvm_x86_ops->decache_regs(vcpu); } +batch: r = __vcpu_run(vcpu, kvm_run); + if (!r && vcpu->mmio_is_write && + kvm_run->exit_reason == KVM_EXIT_MMIO && + kvm_run->batch_count < KVM_MAX_BATCH) { + struct kvm_batch *batch = vcpu->arch.batch_data; + int i = kvm_run->batch_count++; + + batch[i].phys_addr = vcpu->mmio_phys_addr; + batch[i].len = vcpu->mmio_size; + memcpy(batch[i].data, vcpu->mmio_data, batch[i].len); + + goto batch; + } out: if (vcpu->sigset_active) sigprocmask(SIG_SETMASK, &sigsaved, NULL); @@ -3830,6 +3843,13 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) } vcpu->arch.pio_data = page_address(page); + page = alloc_page(GFP_KERNEL | __GFP_ZERO); + if (!page) { + r = -ENOMEM; + goto fail; + } + vcpu->arch.batch_data = page_address(page); + r = kvm_mmu_create(vcpu); if (r < 0) goto fail_free_pio_data; @@ -3857,6 +3877,7 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) kvm_mmu_destroy(vcpu); up_read(&vcpu->kvm->slots_lock); free_page((unsigned long)vcpu->arch.pio_data); + free_page((unsigned long)vcpu->arch.batch_data); } struct kvm *kvm_arch_create_vm(void) diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h index 9d963cd..2824652 100644 --- a/include/asm-x86/kvm_host.h +++ b/include/asm-x86/kvm_host.h @@ -26,6 +26,7 @@ #define KVM_PRIVATE_MEM_SLOTS 4 #define KVM_PIO_PAGE_OFFSET 1 +#define KVM_MMIO_PAGE_OFFSET 2 #define CR3_PAE_RESERVED_BITS ((X86_CR3_PWT | X86_CR3_PCD) - 1) #define CR3_NONPAE_RESERVED_BITS ((PAGE_SIZE-1) & ~(X86_CR3_PWT | X86_CR3_PCD)) @@ -255,6 +256,7 @@ struct kvm_vcpu_arch { gva_t mmio_fault_cr2; struct kvm_pio_request pio; void *pio_data; + void *batch_data; struct kvm_queued_exception { bool pending; diff --git a/include/linux/kvm.h b/include/linux/kvm.h index a281afe..cf0d266 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -94,7 +94,8 @@ struct kvm_run { __u32 exit_reason; __u8 ready_for_interrupt_injection; __u8 if_flag; - __u8 padding2[2]; + __u8 batch_count; + __u8 padding2; /* in (pre_kvm_run), out (post_kvm_run) */ __u64 cr8; @@ -173,6 +174,13 @@ struct kvm_run { }; }; +#define KVM_MAX_BATCH (PAGE_SIZE / sizeof(struct kvm_batch)) +struct kvm_batch { + __u64 phys_addr; + __u32 len; + __u8 data[8]; +}; + /* for KVM_TRANSLATE */ struct kvm_translation { /* in */ diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index d3cb4cc..b2234b3 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -796,6 +796,8 @@ static int kvm_vcpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf) #ifdef CONFIG_X86 else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET) page = virt_to_page(vcpu->arch.pio_data); + else if (vmf->pgoff == KVM_MMIO_PAGE_OFFSET) + page = virt_to_page(vcpu->arch.batch_data); #endif else return VM_FAULT_SIGBUS; 
@@ -1214,6 +1216,7 @@ static long kvm_dev_ioctl(struct file *filp, r = PAGE_SIZE; /* struct kvm_run */ #ifdef CONFIG_X86 r += PAGE_SIZE; /* pio data page */ + r += PAGE_SIZE; /* mmio batch page */ #endif break; case KVM_TRACE_ENABLE: -- 1.5.2.4 |
From: Laurent V. <Lau...@bu...> - 2008-04-23 13:09:13
|
This patch is the userspace part of the "batch writes to MMIO" patch. When the kernel has to send MMIO writes to userspace, it stores them in memory until it has to return control to userspace for another reason. This avoids having too many context switches for operations that can wait. Signed-off-by: Laurent Vivier <Lau...@bu...> --- libkvm/libkvm.c | 17 +++++++++++++++++ 1 files changed, 17 insertions(+), 0 deletions(-) diff --git a/libkvm/libkvm.c b/libkvm/libkvm.c index 329f29f..be74477 100644 --- a/libkvm/libkvm.c +++ b/libkvm/libkvm.c @@ -864,6 +864,10 @@ int kvm_run(kvm_context_t kvm, int vcpu) int r; int fd = kvm->vcpu_fd[vcpu]; struct kvm_run *run = kvm->run[vcpu]; +#if defined(__x86_64__) || defined(__i386__) + struct kvm_batch *batch = (void *)run + 2 * PAGE_SIZE; + int i; +#endif again: if (!kvm->irqchip_in_kernel) @@ -882,6 +886,19 @@ again: post_kvm_run(kvm, vcpu); +#if defined(__x86_64__) || defined(__i386__) + for (i = 0; i < run->batch_count; i++) { + if ((batch[i].phys_addr > 0xa0000-4 && + batch[i].phys_addr <= 0xa0000) && batch[i].len == 3) + continue; + kvm->callbacks->mmio_write(kvm->opaque, + batch[i].phys_addr, + &batch[i].data[0], batch[i].len); + + } + run->batch_count = 0; +#endif + if (r == -1) { r = handle_io_window(kvm); goto more; -- 1.5.2.4 |
From: Anthony L. <an...@co...> - 2008-04-23 16:53:35
|
Laurent Vivier wrote: > This patch is the kernel part of the "batch writes to MMIO" patch. > > When kernel has to send MMIO writes to userspace, it stores them > in memory until it has to pass the hand to userspace for another > reason. This avoids to have too many context switches on operations > that can wait. > > WARNING: this breaks compatibility with old userspace part. > > Signed-off-by: Laurent Vivier <Lau...@bu...> > --- > arch/x86/kvm/x86.c | 21 +++++++++++++++++++++ > include/asm-x86/kvm_host.h | 2 ++ > include/linux/kvm.h | 10 +++++++++- > virt/kvm/kvm_main.c | 3 +++ > 4 files changed, 35 insertions(+), 1 deletions(-) > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > index 0ce5563..3881056 100644 > --- a/arch/x86/kvm/x86.c > +++ b/arch/x86/kvm/x86.c > @@ -2942,8 +2942,21 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) > kvm_x86_ops->decache_regs(vcpu); > } > > +batch: > r = __vcpu_run(vcpu, kvm_run); > > + if (!r && vcpu->mmio_is_write && > + kvm_run->exit_reason == KVM_EXIT_MMIO && > + kvm_run->batch_count < KVM_MAX_BATCH) { > + struct kvm_batch *batch = vcpu->arch.batch_data; > + int i = kvm_run->batch_count++; > + > + batch[i].phys_addr = vcpu->mmio_phys_addr; > + batch[i].len = vcpu->mmio_size; > + memcpy(batch[i].data, vcpu->mmio_data, batch[i].len); > + > + goto batch; > + } > I wonder if this is sufficient for dynticks enabled guests. __vcpu_run could last a very long time. Ignoring that for a moment, Avi's comment about having userspace tell you which addresses to batch is an important one. MMIO writes may have side effects and the next vcpu_run may rely on those side effects. For instance, MMIO based IRQ acknowledgement. You need a white list not only for performances purposes but also for correctness. Regards, Anthony Liguori |
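The whitelist Anthony is describing could be as simple as the sketch below: userspace declares which physical ranges are safe to delay, and the kernel queues a write only if it falls entirely inside one of them, exiting immediately for everything else. Names and types here are illustrative only; the v2 patches later in this thread implement the real mechanism as the KVM_SET_MMIO ioctl and kvm_is_delayed_mmio().

/* Illustrative only: queue a write only when its target range is fully
 * contained in a zone userspace has declared delayable. */
#include <stdint.h>
#include <stddef.h>

struct delayable_zone {
        uint64_t addr;
        uint32_t size;
};

static int mmio_write_is_delayable(const struct delayable_zone *zones,
                                   size_t nzones, uint64_t addr, uint32_t len)
{
        size_t i;

        for (i = 0; i < nzones; i++)
                if (zones[i].addr <= addr &&
                    addr + len <= zones[i].addr + zones[i].size)
                        return 1;   /* safe to batch */
        return 0;                   /* unknown or latency-sensitive: exit to userspace now */
}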
From: Avi K. <av...@qu...> - 2008-04-23 14:09:26
|
Laurent Vivier wrote: > These two patches allow to batch writes to MMIO. > > When kernel has to send MMIO writes to userspace, it stores them > in memory until it has to pass the hand to userspace for another > reason. This avoids to have too many context switches on operations > that can wait. > > Did you obtain any measurable performance benefit? > WARNING: this breaks compatibility with old userspace part. > > So it's just an RFC :) -- Do not meddle in the internals of kernels, for they are subtle and quick to panic. |
From: Laurent V. <Lau...@bu...> - 2008-04-23 14:23:39
|
Le mercredi 23 avril 2008 à 17:05 +0300, Avi Kivity a écrit : > Laurent Vivier wrote: > > These two patches allow to batch writes to MMIO. > > > > When kernel has to send MMIO writes to userspace, it stores them > > in memory until it has to pass the hand to userspace for another > > reason. This avoids to have too many context switches on operations > > that can wait. > > > > > > Did you obtain any measurable performance benefit? Well, the problem is how to measure it. Really, I don't know. But when I add traces I saw MMIO writes are batched: by group of 170 (this is the max in a page) at the beginning, and by group of 10 with XP when we move a window. So all comments are welcome... > > WARNING: this breaks compatibility with old userspace part. > > > > > > So it's just an RFC :) Yes... to have some comments how to manage this :) Laurent -- ------------- Lau...@bu... --------------- "The best way to predict the future is to invent it." - Alan Kay |
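The 170 figure matches the entry size in patch 1, assuming 4 KiB pages: struct kvm_batch is 8 + 4 + 8 = 20 bytes, which the compiler rounds up to 24 on x86-64 (see Avi's sizing remark below), so KVM_MAX_BATCH = PAGE_SIZE / sizeof(struct kvm_batch) = 4096 / 24 = 170 entries per page.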
From: Avi K. <av...@qu...> - 2008-04-23 14:44:09
|
Laurent Vivier wrote: > Le mercredi 23 avril 2008 à 17:05 +0300, Avi Kivity a écrit : > >> Laurent Vivier wrote: >> >>> These two patches allow to batch writes to MMIO. >>> >>> When kernel has to send MMIO writes to userspace, it stores them >>> in memory until it has to pass the hand to userspace for another >>> reason. This avoids to have too many context switches on operations >>> that can wait. >>> >>> >>> >> Did you obtain any measurable performance benefit? >> > > Well, the problem is how to measure it. Really, I don't know. > > But when I add traces I saw MMIO writes are batched: by group of 170 > (this is the max in a page) at the beginning, That's the lovely animation. You can time XP startup, or look at system time in 'top' once it stabilizes. Also compare total host_state_reloads to boot in kvm_stat (each costs 5 usec, after discount). > and by group of 10 with XP > when we move a window. > That's probably programming the cirrus blitter, also a candidate for batching, but probably not measurable. Maybe one of those Windows graphics benchmarks which draw tons of rectangles very fast and cause epilepsy seizures. I guess most interesting is e1000, if we need to do more than one mmio to start transmitting. -- Do not meddle in the internals of kernels, for they are subtle and quick to panic. |
From: Anthony L. <an...@co...> - 2008-04-23 15:47:46
|
Avi Kivity wrote: > Anthony Liguori wrote: >> >> The ne2k is pretty mmio heavy. You should be able to observe a boost >> with something like iperf (guest=>host) I would think if this is a >> real savings. >> > > If we're just improving ne2k, the complexity isn't worth it. We have > two better nics which are widely supported in guests. Sure, but the ne2k should be pretty close to an optimal synthetic benchmark for batching. If we don't see a big improvement with it, we're probably not going to see a big improvement with it for anything. As you point out though, the converse isn't necessarily true. Regards, Anthony Liguori |
From: Avi K. <av...@qu...> - 2008-04-23 17:17:17
|
Anthony Liguori wrote: > Avi Kivity wrote: > >> Anthony Liguori wrote: >> >>> The ne2k is pretty mmio heavy. You should be able to observe a boost >>> with something like iperf (guest=>host) I would think if this is a >>> real savings. >>> >>> >> If we're just improving ne2k, the complexity isn't worth it. We have >> two better nics which are widely supported in guests. >> > > Sure, but the ne2k should be pretty close to an optimal synthetic > benchmark for batching. If we don't see a big improvement with it, > we're probably not going to see a big improvement with it for anything. > I disagree. If there are other inefficiencies in ne2k, they will mask any performance improvement from batching. So whatever result we get will be inconclusive. I don't expect any massive performance improvement except for splash screens. I'll be happy to settle for a few percent with e1000 or scsi or ide. -- Do not meddle in the internals of kernels, for they are subtle and quick to panic. |
From: Avi K. <av...@qu...> - 2008-04-23 15:16:38
|
Anthony Liguori wrote: > > The ne2k is pretty mmio heavy. You should be able to observe a boost > with something like iperf (guest=>host) I would think if this is a > real savings. > If we're just improving ne2k, the complexity isn't worth it. We have two better nics which are widely supported in guests. -- Do not meddle in the internals of kernels, for they are subtle and quick to panic. |
From: Avi K. <av...@qu...> - 2008-04-23 14:34:46
|
Laurent Vivier wrote: > This patch is the kernel part of the "batch writes to MMIO" patch. > > When kernel has to send MMIO writes to userspace, it stores them > in memory until it has to pass the hand to userspace for another > reason. This avoids to have too many context switches on operations > that can wait. > > WARNING: this breaks compatibility with old userspace part. > > Signed-off-by: Laurent Vivier <Lau...@bu...> > --- > arch/x86/kvm/x86.c | 21 +++++++++++++++++++++ > include/asm-x86/kvm_host.h | 2 ++ > include/linux/kvm.h | 10 +++++++++- > virt/kvm/kvm_main.c | 3 +++ > 4 files changed, 35 insertions(+), 1 deletions(-) > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > index 0ce5563..3881056 100644 > --- a/arch/x86/kvm/x86.c > +++ b/arch/x86/kvm/x86.c > @@ -2942,8 +2942,21 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) > kvm_x86_ops->decache_regs(vcpu); > } > > +batch: > r = __vcpu_run(vcpu, kvm_run); > > + if (!r && vcpu->mmio_is_write && > + kvm_run->exit_reason == KVM_EXIT_MMIO && > + kvm_run->batch_count < KVM_MAX_BATCH) { > + struct kvm_batch *batch = vcpu->arch.batch_data; > + int i = kvm_run->batch_count++; > + > + batch[i].phys_addr = vcpu->mmio_phys_addr; > + batch[i].len = vcpu->mmio_size; > + memcpy(batch[i].data, vcpu->mmio_data, batch[i].len); > This breaks ordering on smp guests. batch_data needs to be a kvm thing, not a vcpu thing, and locked, of course. Also, you don't want to queue writes which trigger I/O since that will affect latency. Userspace should tell the kernel which mmio addresses are queuable. > + > + goto batch; > If you move this to within __vcpu_run, you won't need to loop. Maybe the best place is where we actually decide it's an mmio write. You also need to flush the queue each time you have an in-kernel mmio write. For example: vcpu0 vcpu1 mmio write (queued) apic write -> IPI doesn't see effects of write > + } > out: > if (vcpu->sigset_active) > sigprocmask(SIG_SETMASK, &sigsaved, NULL); > @@ -3830,6 +3843,13 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) > } > vcpu->arch.pio_data = page_address(page); > > + page = alloc_page(GFP_KERNEL | __GFP_ZERO); > + if (!page) { > + r = -ENOMEM; > + goto fail; > + } > + vcpu->arch.batch_data = page_address(page); > + > r = kvm_mmu_create(vcpu); > if (r < 0) > goto fail_free_pio_data; > @@ -3857,6 +3877,7 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) > kvm_mmu_destroy(vcpu); > up_read(&vcpu->kvm->slots_lock); > free_page((unsigned long)vcpu->arch.pio_data); > + free_page((unsigned long)vcpu->arch.batch_data); > } > > #define CR3_PAE_RESERVED_BITS ((X86_CR3_PWT | X86_CR3_PCD) - 1) > #define CR3_NONPAE_RESERVED_BITS ((PAGE_SIZE-1) & ~(X86_CR3_PWT | X86_CR3_PCD)) > @@ -255,6 +256,7 @@ struct kvm_vcpu_arch { > gva_t mmio_fault_cr2; > struct kvm_pio_request pio; > void *pio_data; > + void *batch_data; > > It's an array of structs, no? So it shouldn't be a void *. > > +#define KVM_MAX_BATCH (PAGE_SIZE / sizeof(struct kvm_batch)) > +struct kvm_batch { > + __u64 phys_addr; > + __u32 len; > + __u8 data[8]; > +}; > Size is 24 on 64-bit and 20 on 32-bit. Please pad (after len, so data is nicely aligned). -- Do not meddle in the internals of kernels, for they are subtle and quick to panic. |
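For reference, the padded layout Avi is asking for (same 24-byte size on 32-bit and 64-bit, data[] kept 8-byte aligned) would look roughly like this; it is essentially the struct kvm_mmio that the v2 patch later in the thread adopts, using the __uNN types from <linux/types.h>:

struct kvm_batch {
        __u64 phys_addr;        /* guest physical address of the write */
        __u32 len;              /* number of valid bytes in data[] */
        __u32 pad;              /* explicit padding: size and alignment stable on 32/64-bit */
        __u8  data[8];          /* value being written, at most 8 bytes */
};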
From: Laurent V. <Lau...@bu...> - 2008-04-23 14:59:23
|
Le mercredi 23 avril 2008 à 17:31 +0300, Avi Kivity a écrit : > Laurent Vivier wrote: > > This patch is the kernel part of the "batch writes to MMIO" patch. > > > > When kernel has to send MMIO writes to userspace, it stores them > > in memory until it has to pass the hand to userspace for another > > reason. This avoids to have too many context switches on operations > > that can wait. > > > > WARNING: this breaks compatibility with old userspace part. > > > > Signed-off-by: Laurent Vivier <Lau...@bu...> > > --- > > arch/x86/kvm/x86.c | 21 +++++++++++++++++++++ > > include/asm-x86/kvm_host.h | 2 ++ > > include/linux/kvm.h | 10 +++++++++- > > virt/kvm/kvm_main.c | 3 +++ > > 4 files changed, 35 insertions(+), 1 deletions(-) > > > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > > index 0ce5563..3881056 100644 > > --- a/arch/x86/kvm/x86.c > > +++ b/arch/x86/kvm/x86.c > > @@ -2942,8 +2942,21 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) > > kvm_x86_ops->decache_regs(vcpu); > > } > > > > +batch: > > r = __vcpu_run(vcpu, kvm_run); > > > > + if (!r && vcpu->mmio_is_write && > > + kvm_run->exit_reason == KVM_EXIT_MMIO && > > + kvm_run->batch_count < KVM_MAX_BATCH) { > > + struct kvm_batch *batch = vcpu->arch.batch_data; > > + int i = kvm_run->batch_count++; > > + > > + batch[i].phys_addr = vcpu->mmio_phys_addr; > > + batch[i].len = vcpu->mmio_size; > > + memcpy(batch[i].data, vcpu->mmio_data, batch[i].len); > > > > This breaks ordering on smp guests. batch_data needs to be a kvm thing, > not a vcpu thing, and locked, of course. - is ordering between vcpu important when we already delay operations ? - using vcpu avoids the lock - Why PIO (pio_data) are vcpu things and not kvm things, then ? > Also, you don't want to queue writes which trigger I/O since that will > affect latency. Userspace should tell the kernel which mmio addresses > are queuable. I agree (but in my first patch it was easier to ignore this) > > + > > + goto batch; > > > > If you move this to within __vcpu_run, you won't need to loop. Maybe > the best place is where we actually decide it's an mmio write. I agree > You also need to flush the queue each time you have an in-kernel mmio > write. For example: > > vcpu0 vcpu1 > > mmio write (queued) > apic write -> IPI > doesn't see effects of write I agree > > + } > > out: > > if (vcpu->sigset_active) > > sigprocmask(SIG_SETMASK, &sigsaved, NULL); > > @@ -3830,6 +3843,13 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) > > } > > vcpu->arch.pio_data = page_address(page); > > > > + page = alloc_page(GFP_KERNEL | __GFP_ZERO); > > + if (!page) { > > + r = -ENOMEM; > > + goto fail; > > + } > > + vcpu->arch.batch_data = page_address(page); > > + > > r = kvm_mmu_create(vcpu); > > if (r < 0) > > goto fail_free_pio_data; > > @@ -3857,6 +3877,7 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) > > kvm_mmu_destroy(vcpu); > > up_read(&vcpu->kvm->slots_lock); > > free_page((unsigned long)vcpu->arch.pio_data); > > + free_page((unsigned long)vcpu->arch.batch_data); > > } > > > > #define CR3_PAE_RESERVED_BITS ((X86_CR3_PWT | X86_CR3_PCD) - 1) > > #define CR3_NONPAE_RESERVED_BITS ((PAGE_SIZE-1) & ~(X86_CR3_PWT | X86_CR3_PCD)) > > @@ -255,6 +256,7 @@ struct kvm_vcpu_arch { > > gva_t mmio_fault_cr2; > > struct kvm_pio_request pio; > > void *pio_data; > > + void *batch_data; > > > > > > It's an array of structs, no? So it shouldn't be a void *. Yes (I love cut&paste...) 
> > > > +#define KVM_MAX_BATCH (PAGE_SIZE / sizeof(struct kvm_batch)) > > +struct kvm_batch { > > + __u64 phys_addr; > > + __u32 len; > > + __u8 data[8]; > > +}; > > > > Size is 24 on 64-bit and 20 on 32-bit. Please pad (after len, so data > is nicely aligned). OK Thank you, Laurent -- ------------- Lau...@bu... --------------- "The best way to predict the future is to invent it." - Alan Kay |
From: Avi K. <av...@qu...> - 2008-04-23 15:08:24
|
Laurent Vivier wrote: >>> >>> >> This breaks ordering on smp guests. batch_data needs to be a kvm thing, >> not a vcpu thing, and locked, of course. >> > > - is ordering between vcpu important when we already delay operations ? > Yes. Ordering is very different from delaying. Also, we will only delay writes which don't matter (those that simply latch something into a register, which is later read). > - using vcpu avoids the lock > Correctness beats performance > - Why PIO (pio_data) are vcpu things and not kvm things, then ? > pio batching is used for just a single instruction that can work with batches (ins/outs), so a guest can't expect to see partial output. With mmio, you can have 1. mmio 2. raise flag in shared memory 3. other vcpu sees flag, does mmio 4. spin-wait for ack from other vcpu 5. mmio You can't do that with ins/outs. -- Do not meddle in the internals of kernels, for they are subtle and quick to panic. |
From: Anthony L. <an...@co...> - 2008-04-23 15:10:48
|
Avi Kivity wrote: > Laurent Vivier wrote: > > That's the lovely animation. You can time XP startup, or look at system > time in 'top' once it stabilizes. Also compare total host_state_reloads > to boot in kvm_stat (each costs 5 usec, after discount). > Win2k's boot up animation (the moving bar) is done in VGA planar mode. You should be able to time the difference in startup. >> and by group of 10 with XP >> when we move a window. >> >> > > That's probably programming the cirrus blitter, also a candidate for > batching, but probably not measurable. Maybe one of those Windows > graphics benchmarks which draw tons of rectangles very fast and cause > epilepsy seizures. > > I guess most interesting is e1000, if we need to do more than one mmio > to start transmitting. > The ne2k is pretty mmio heavy. You should be able to observe a boost with something like iperf (guest=>host) I would think if this is a real savings. Regards, Anthony Liguori |
From: Laurent V. <Lau...@bu...> - 2008-04-23 16:24:01
|
Le mercredi 23 avril 2008 à 10:10 -0500, Anthony Liguori a écrit : [...] > The ne2k is pretty mmio heavy. You should be able to observe a boost > with something like iperf (guest=>host) I would think if this is a real > savings. I like your advices :-D I use iperf with e1000 emulation and a slightly modified patch (to detect MMIO write in a loop), server is on the host, client on the guest, with default values. RESULT WITHOUT BATCHING: [ 4] 0.0-10.0 sec 235 MBytes 197 Mbits/sec [ 5] 0.0-10.0 sec 194 MBytes 163 Mbits/sec [ 4] 0.0-10.0 sec 185 MBytes 155 Mbits/sec [ 5] 0.0-10.0 sec 227 MBytes 190 Mbits/sec [ 4] 0.0-10.0 sec 196 MBytes 164 Mbits/sec [ 5] 0.0-10.0 sec 194 MBytes 163 Mbits/sec [ 4] 0.0-10.0 sec 184 MBytes 154 Mbits/sec RESULT WITH BATCHING: ------------------------------------------------------------ Server listening on TCP port 5001 TCP window size: 85.3 KByte (default) ------------------------------------------------------------ [ 4] 0.0-10.0 sec 357 MBytes 299 Mbits/sec [ 5] 0.0-10.1 sec 418 MBytes 347 Mbits/sec [ 4] 0.0-10.0 sec 408 MBytes 342 Mbits/sec [ 5] 0.0-10.0 sec 422 MBytes 353 Mbits/sec [ 4] 0.0-10.1 sec 436 MBytes 362 Mbits/sec [ 5] 0.0-10.0 sec 416 MBytes 348 Mbits/sec [ 4] 0.0-10.0 sec 431 MBytes 361 Mbits/sec Well, it's nice ? Laurent -- ------------- Lau...@bu... --------------- "The best way to predict the future is to invent it." - Alan Kay |
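Averaged over the runs shown, that is roughly 169 Mbits/sec without the patch against 345 Mbits/sec with it, slightly more than a 2x gain, which is what prompts Avi's scepticism below.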
From: Avi K. <av...@qu...> - 2008-04-23 16:29:44
|
Laurent Vivier wrote: > Le mercredi 23 avril 2008 à 10:10 -0500, Anthony Liguori a écrit : > [...] > >> The ne2k is pretty mmio heavy. You should be able to observe a boost >> with something like iperf (guest=>host) I would think if this is a real >> savings. >> > > I like your advices :-D > > I use iperf with e1000 emulation and a slightly modified patch (to > detect MMIO write in a loop), server is on the host, client on the > guest, with default values. > > RESULT WITHOUT BATCHING: > > [ 4] 0.0-10.0 sec 235 MBytes 197 Mbits/sec > [ 5] 0.0-10.0 sec 194 MBytes 163 Mbits/sec > [ 4] 0.0-10.0 sec 185 MBytes 155 Mbits/sec > [ 5] 0.0-10.0 sec 227 MBytes 190 Mbits/sec > [ 4] 0.0-10.0 sec 196 MBytes 164 Mbits/sec > [ 5] 0.0-10.0 sec 194 MBytes 163 Mbits/sec > [ 4] 0.0-10.0 sec 184 MBytes 154 Mbits/sec > > RESULT WITH BATCHING: > > ------------------------------------------------------------ > Server listening on TCP port 5001 > TCP window size: 85.3 KByte (default) > ------------------------------------------------------------ > [ 4] 0.0-10.0 sec 357 MBytes 299 Mbits/sec > [ 5] 0.0-10.1 sec 418 MBytes 347 Mbits/sec > [ 4] 0.0-10.0 sec 408 MBytes 342 Mbits/sec > [ 5] 0.0-10.0 sec 422 MBytes 353 Mbits/sec > [ 4] 0.0-10.1 sec 436 MBytes 362 Mbits/sec > [ 5] 0.0-10.0 sec 416 MBytes 348 Mbits/sec > [ 4] 0.0-10.0 sec 431 MBytes 361 Mbits/sec > > Well, it's nice ? > It's too good to be true. I think we're seeing two bugs cancel each other out, resulting in a performance gain. Linux doesn't know how to queue outgoing packets, so it bangs on the mmio that starts the transmit after every packet. mmio batching doesn't know that this mmio register is critical for latency, so it queues it up. The result is that you you get not just mmio batching, but also packet batching! Which dramatically improves performace at the expense of latency. Sorry (if it's true :) -- Do not meddle in the internals of kernels, for they are subtle and quick to panic. |
From: Laurent V. <Lau...@bu...> - 2008-04-23 16:40:59
|
Le mercredi 23 avril 2008 à 19:25 +0300, Avi Kivity a écrit : > Laurent Vivier wrote: > > Le mercredi 23 avril 2008 à 10:10 -0500, Anthony Liguori a écrit : > > [...] > > > >> The ne2k is pretty mmio heavy. You should be able to observe a boost > >> with something like iperf (guest=>host) I would think if this is a real > >> savings. > >> > > > > I like your advices :-D > > > > I use iperf with e1000 emulation and a slightly modified patch (to > > detect MMIO write in a loop), server is on the host, client on the > > guest, with default values. > > > > RESULT WITHOUT BATCHING: > > > > [ 4] 0.0-10.0 sec 235 MBytes 197 Mbits/sec > > [ 5] 0.0-10.0 sec 194 MBytes 163 Mbits/sec > > [ 4] 0.0-10.0 sec 185 MBytes 155 Mbits/sec > > [ 5] 0.0-10.0 sec 227 MBytes 190 Mbits/sec > > [ 4] 0.0-10.0 sec 196 MBytes 164 Mbits/sec > > [ 5] 0.0-10.0 sec 194 MBytes 163 Mbits/sec > > [ 4] 0.0-10.0 sec 184 MBytes 154 Mbits/sec > > > > RESULT WITH BATCHING: > > > > ------------------------------------------------------------ > > Server listening on TCP port 5001 > > TCP window size: 85.3 KByte (default) > > ------------------------------------------------------------ > > [ 4] 0.0-10.0 sec 357 MBytes 299 Mbits/sec > > [ 5] 0.0-10.1 sec 418 MBytes 347 Mbits/sec > > [ 4] 0.0-10.0 sec 408 MBytes 342 Mbits/sec > > [ 5] 0.0-10.0 sec 422 MBytes 353 Mbits/sec > > [ 4] 0.0-10.1 sec 436 MBytes 362 Mbits/sec > > [ 5] 0.0-10.0 sec 416 MBytes 348 Mbits/sec > > [ 4] 0.0-10.0 sec 431 MBytes 361 Mbits/sec > > > > Well, it's nice ? > > > > It's too good to be true. > > I think we're seeing two bugs cancel each other out, resulting in a > performance gain. Linux doesn't know how to queue outgoing packets, so > it bangs on the mmio that starts the transmit after every packet. mmio > batching doesn't know that this mmio register is critical for latency, > so it queues it up. The result is that you you get not just mmio > batching, but also packet batching! Which dramatically improves > performace at the expense of latency. How can I check that ? How can I measure latency ? Perhaps I can swap server and client between guest and host ? > > Sorry (if it's true :) > Thank you for your help, Laurent -- ------------- Lau...@bu... --------------- "The best way to predict the future is to invent it." - Alan Kay |
From: Anthony L. <an...@co...> - 2008-04-23 16:49:13
|
Laurent Vivier wrote: > Le mercredi 23 avril 2008 à 19:25 +0300, Avi Kivity a écrit : > >> Laurent Vivier wrote: >> >>> Le mercredi 23 avril 2008 à 10:10 -0500, Anthony Liguori a écrit : >>> [...] >>> >>> >>>> The ne2k is pretty mmio heavy. You should be able to observe a boost >>>> with something like iperf (guest=>host) I would think if this is a real >>>> savings. >>>> >>>> >>> I like your advices :-D >>> >>> I use iperf with e1000 emulation and a slightly modified patch (to >>> detect MMIO write in a loop), server is on the host, client on the >>> guest, with default values. >>> >>> RESULT WITHOUT BATCHING: >>> >>> [ 4] 0.0-10.0 sec 235 MBytes 197 Mbits/sec >>> [ 5] 0.0-10.0 sec 194 MBytes 163 Mbits/sec >>> [ 4] 0.0-10.0 sec 185 MBytes 155 Mbits/sec >>> [ 5] 0.0-10.0 sec 227 MBytes 190 Mbits/sec >>> [ 4] 0.0-10.0 sec 196 MBytes 164 Mbits/sec >>> [ 5] 0.0-10.0 sec 194 MBytes 163 Mbits/sec >>> [ 4] 0.0-10.0 sec 184 MBytes 154 Mbits/sec >>> >>> RESULT WITH BATCHING: >>> >>> ------------------------------------------------------------ >>> Server listening on TCP port 5001 >>> TCP window size: 85.3 KByte (default) >>> ------------------------------------------------------------ >>> [ 4] 0.0-10.0 sec 357 MBytes 299 Mbits/sec >>> [ 5] 0.0-10.1 sec 418 MBytes 347 Mbits/sec >>> [ 4] 0.0-10.0 sec 408 MBytes 342 Mbits/sec >>> [ 5] 0.0-10.0 sec 422 MBytes 353 Mbits/sec >>> [ 4] 0.0-10.1 sec 436 MBytes 362 Mbits/sec >>> [ 5] 0.0-10.0 sec 416 MBytes 348 Mbits/sec >>> [ 4] 0.0-10.0 sec 431 MBytes 361 Mbits/sec >>> >>> Well, it's nice ? >>> >>> >> It's too good to be true. >> >> I think we're seeing two bugs cancel each other out, resulting in a >> performance gain. Linux doesn't know how to queue outgoing packets, so >> it bangs on the mmio that starts the transmit after every packet. mmio >> batching doesn't know that this mmio register is critical for latency, >> so it queues it up. The result is that you you get not just mmio >> batching, but also packet batching! Which dramatically improves >> performace at the expense of latency. >> > > How can I check that ? How can I measure latency ? > ping (from guest to host) Regards, Anthony Liguori > Perhaps I can swap server and client between guest and host ? > > >> Sorry (if it's true :) >> >> > > Thank you for your help, > Laurent > |
From: Avi K. <av...@qu...> - 2008-04-23 16:54:59
|
Anthony Liguori wrote: >> >> How can I check that ? How can I measure latency ? >> > > ping (from guest to host) > The guest will halt anyway, flushing its mmio queue. Perhaps a ping while a background process spins, consuming all cpu. -- Do not meddle in the internals of kernels, for they are subtle and quick to panic. |
From: Laurent V. <Lau...@bu...> - 2008-04-23 18:54:37
|
Le mercredi 23 avril 2008 à 11:48 -0500, Anthony Liguori a écrit : > Laurent Vivier wrote: > > Le mercredi 23 avril 2008 à 19:25 +0300, Avi Kivity a écrit : > > > >> Laurent Vivier wrote: > >> > >>> Le mercredi 23 avril 2008 à 10:10 -0500, Anthony Liguori a écrit : > >>> [...] > >>> > >>> > >>>> The ne2k is pretty mmio heavy. You should be able to observe a boost > >>>> with something like iperf (guest=>host) I would think if this is a real > >>>> savings. > >>>> > >>>> > >>> I like your advices :-D > >>> > >>> I use iperf with e1000 emulation and a slightly modified patch (to > >>> detect MMIO write in a loop), server is on the host, client on the > >>> guest, with default values. > >>> > >>> RESULT WITHOUT BATCHING: > >>> > >>> [ 4] 0.0-10.0 sec 235 MBytes 197 Mbits/sec > >>> [ 5] 0.0-10.0 sec 194 MBytes 163 Mbits/sec > >>> [ 4] 0.0-10.0 sec 185 MBytes 155 Mbits/sec > >>> [ 5] 0.0-10.0 sec 227 MBytes 190 Mbits/sec > >>> [ 4] 0.0-10.0 sec 196 MBytes 164 Mbits/sec > >>> [ 5] 0.0-10.0 sec 194 MBytes 163 Mbits/sec > >>> [ 4] 0.0-10.0 sec 184 MBytes 154 Mbits/sec > >>> > >>> RESULT WITH BATCHING: > >>> > >>> ------------------------------------------------------------ > >>> Server listening on TCP port 5001 > >>> TCP window size: 85.3 KByte (default) > >>> ------------------------------------------------------------ > >>> [ 4] 0.0-10.0 sec 357 MBytes 299 Mbits/sec > >>> [ 5] 0.0-10.1 sec 418 MBytes 347 Mbits/sec > >>> [ 4] 0.0-10.0 sec 408 MBytes 342 Mbits/sec > >>> [ 5] 0.0-10.0 sec 422 MBytes 353 Mbits/sec > >>> [ 4] 0.0-10.1 sec 436 MBytes 362 Mbits/sec > >>> [ 5] 0.0-10.0 sec 416 MBytes 348 Mbits/sec > >>> [ 4] 0.0-10.0 sec 431 MBytes 361 Mbits/sec > >>> > >>> Well, it's nice ? > >>> > >>> > >> It's too good to be true. > >> > >> I think we're seeing two bugs cancel each other out, resulting in a > >> performance gain. Linux doesn't know how to queue outgoing packets, so > >> it bangs on the mmio that starts the transmit after every packet. mmio > >> batching doesn't know that this mmio register is critical for latency, > >> so it queues it up. The result is that you you get not just mmio > >> batching, but also packet batching! Which dramatically improves > >> performace at the expense of latency. > >> > > > > How can I check that ? How can I measure latency ? > > > > ping (from guest to host) I have 40 ms instead of 0.09 ms, so Avi you are right. Laurent -- ------------- Lau...@bu... --------------- "The best way to predict the future is to invent it." - Alan Kay |
From: Laurent V. <Lau...@bu...> - 2008-05-15 13:51:31
|
These two patches allow batching writes to MMIO. When the kernel has to send MMIO writes to userspace, it stores them in memory until it has to return control to userspace for another reason. This avoids having too many context switches for operations that can wait. These patches introduce an ioctl() to define which MMIO addresses are allowed to be delayed.

I made some benchmarks with iperf and e1000 (average over 10 runs):

    with patch: 257.2 MB/s     without patch: 193.7 MB/s     (33% faster)

I've measured host_state_reload on WinXP boot:

    with patch: 561397         without patch: 739708         (24% less)

I've measured host_state_reload on a VGA text scroll:

    with patch: 3976242        without patch: 13779849       (70% less...)

[PATCH 1/2] kvm: Batch writes to MMIO - kernel part
[PATCH 2/2] kvm-userspace: Batch writes to MMIO - userspace part

Signed-off-by: Laurent Vivier <Lau...@bu...> |
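Taking Avi's earlier estimate of roughly 5 usec per host_state_reload at face value, the WinXP boot numbers above work out to about (739708 - 561397) * 5 usec, roughly 0.9 seconds of host state switching saved per boot, and the VGA text scroll case to about (13779849 - 3976242) * 5 usec, roughly 49 seconds.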
From: Laurent V. <Lau...@bu...> - 2008-05-15 13:52:13
|
This patch is the kernel part of the "batch writes to MMIO" patch. It intoduces the ioctl interface to define MMIO zone it is allowed to delay. Inside a zone, we can define sub-part we must not delay. If an MMIO can be delayed, it is stored in a ring buffer which common for all VCPUs. Signed-off-by: Laurent Vivier <Lau...@bu...> --- arch/x86/kvm/x86.c | 172 ++++++++++++++++++++++++++++++++++++++++++++ include/asm-x86/kvm.h | 7 ++ include/asm-x86/kvm_host.h | 23 ++++++ include/linux/kvm.h | 16 ++++ virt/kvm/kvm_main.c | 3 + 5 files changed, 221 insertions(+), 0 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index dab3d4f..930986b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1518,6 +1518,103 @@ out: return r; } +static struct kvm_delayed_mmio_zone *kvm_mmio_find_zone(struct kvm *kvm, + u64 addr, u32 size) +{ + int i; + struct kvm_delayed_mmio_zone *zone; + + for (i = 0; i < kvm->arch.nb_mmio_zones; i++) { + zone = &kvm->arch.mmio_zone[i]; + + /* (addr,size) is fully included in + * (zone->addr, zone->size) + */ + + if (zone->addr <= addr && + addr + size <= zone->addr + zone->size) + return zone; + } + return NULL; +} + +static struct kvm_excluded_mmio_zone * +kvm_mmio_find_excluded(struct kvm_delayed_mmio_zone *zone, u64 addr, u32 size) +{ + static struct kvm_excluded_mmio_zone *excluded; + int i; + + addr -= zone->addr; + for (i = 0; i < zone->nb_excluded_zones; i++) { + excluded = &zone->excluded[i]; + + if ((excluded->offset <= addr && + addr < excluded->offset + excluded->size) || + (excluded->offset < addr + size && + addr + size <= excluded->offset + + excluded->size)) + return excluded; + } + return NULL; +} + +static int kvm_is_delayed_mmio(struct kvm *kvm, u64 addr, u32 size) +{ + struct kvm_delayed_mmio_zone *zone; + struct kvm_excluded_mmio_zone *excluded; + + zone = kvm_mmio_find_zone(kvm, addr, size); + if (zone == NULL) + return 0; /* not a delayed MMIO address */ + + excluded = kvm_mmio_find_excluded(zone, addr, size); + return excluded == NULL; +} + +static int kvm_vm_ioctl_set_mmio(struct kvm *kvm, + struct kvm_mmio_zone *zone) +{ + struct kvm_delayed_mmio_zone *z; + + if (zone->is_delayed && + kvm->arch.nb_mmio_zones >= KVM_MAX_DELAYED_MMIO_ZONE) + return -ENOMEM; + + if (zone->is_delayed) { + + /* already defined ? 
*/ + + if (kvm_mmio_find_zone(kvm, zone->addr, 1) || + kvm_mmio_find_zone(kvm, zone->addr + zone->size - 1, 1)) + return 0; + + z = &kvm->arch.mmio_zone[kvm->arch.nb_mmio_zones]; + z->addr = zone->addr; + z->size = zone->size; + kvm->arch.nb_mmio_zones++; + return 0; + } + + /* exclude some parts of the delayed MMIO zone */ + + z = kvm_mmio_find_zone(kvm, zone->addr, zone->size); + if (z == NULL) + return -EINVAL; + + if (z->nb_excluded_zones >= KVM_MAX_EXCLUDED_MMIO_ZONE) + return -ENOMEM; + + if (kvm_mmio_find_excluded(z, zone->addr, 1) || + kvm_mmio_find_excluded(z, zone->addr + zone->size - 1, 1)) + return 0; + + z->excluded[z->nb_excluded_zones].offset = zone->addr - z->addr; + z->excluded[z->nb_excluded_zones].size = zone->size; + z->nb_excluded_zones++; + + return 0; +} + long kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -1671,6 +1768,18 @@ long kvm_arch_vm_ioctl(struct file *filp, r = 0; break; } + case KVM_SET_MMIO: { + struct kvm_mmio_zone zone; + r = -EFAULT; + if (copy_from_user(&zone, argp, sizeof zone)) + goto out; + r = -ENXIO; + r = kvm_vm_ioctl_set_mmio(kvm, &zone); + if (r) + goto out; + r = 0; + break; + } default: ; } @@ -2706,6 +2815,52 @@ static void vapic_exit(struct kvm_vcpu *vcpu) mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); } +static int batch_mmio(struct kvm_vcpu *vcpu) +{ + struct kvm_batch *batch = vcpu->kvm->arch.batch; + spinlock_t *lock = &vcpu->kvm->arch.batch_lock; + int next; + + /* check if this MMIO can be delayed */ + + if (!kvm_is_delayed_mmio(vcpu->kvm, + vcpu->mmio_phys_addr, vcpu->mmio_size)) + return 0; + + /* check if ring is full + * we have no lock on "first" + * as it can only increase we can only have + * a false "full". + */ + + spin_lock(lock); + + /* last is the first free entry + * check if we don't meet the first used entry + * there is always one unused entry in the buffer + */ + + next = (batch->last + 1) % KVM_MAX_BATCH; + if (next == batch->first) { + /* full */ + spin_unlock(lock); + return 0; + } + + /* batch it */ + + /* copy data in first free entry of the ring */ + + batch->mmio[batch->last].phys_addr = vcpu->mmio_phys_addr; + batch->mmio[batch->last].len = vcpu->mmio_size; + memcpy(batch->mmio[batch->last].data, vcpu->mmio_data, vcpu->mmio_size); + batch->last = next; + + spin_unlock(lock); + + return 1; +} + static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { int r; @@ -2857,6 +3012,11 @@ again: goto again; } + if (!r && + vcpu->mmio_is_write && kvm_run->exit_reason == KVM_EXIT_MMIO + && !need_resched() && batch_mmio(vcpu)) + goto again; + out: up_read(&vcpu->kvm->slots_lock); if (r > 0) { @@ -3856,12 +4016,22 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) struct kvm *kvm_arch_create_vm(void) { struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL); + struct page *page; if (!kvm) return ERR_PTR(-ENOMEM); + page = alloc_page(GFP_KERNEL | __GFP_ZERO); + if (!page) { + kfree(kvm); + return ERR_PTR(-ENOMEM); + } + INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); + kvm->arch.batch_lock = __SPIN_LOCK_UNLOCKED(batch_lock); + kvm->arch.batch = (struct kvm_batch *)page_address(page); + return kvm; } @@ -3902,6 +4072,8 @@ void kvm_arch_destroy_vm(struct kvm *kvm) put_page(kvm->arch.apic_access_page); if (kvm->arch.ept_identity_pagetable) put_page(kvm->arch.ept_identity_pagetable); + if (kvm->arch.batch) + free_page((unsigned long)kvm->arch.batch); kfree(kvm); } diff --git a/include/asm-x86/kvm.h b/include/asm-x86/kvm.h index 6f18408..3c4a611 100644 --- 
a/include/asm-x86/kvm.h +++ b/include/asm-x86/kvm.h @@ -209,6 +209,13 @@ struct kvm_pit_state { struct kvm_pit_channel_state channels[3]; }; +struct kvm_mmio_zone { + __u8 is_delayed; + __u8 pad[3]; + __u32 size; + __u64 addr; +}; + #define KVM_TRC_INJ_VIRQ (KVM_TRC_HANDLER + 0x02) #define KVM_TRC_REDELIVER_EVT (KVM_TRC_HANDLER + 0x03) #define KVM_TRC_PEND_INTR (KVM_TRC_HANDLER + 0x04) diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h index 1466c3f..df42cdb 100644 --- a/include/asm-x86/kvm_host.h +++ b/include/asm-x86/kvm_host.h @@ -26,6 +26,7 @@ #define KVM_PRIVATE_MEM_SLOTS 4 #define KVM_PIO_PAGE_OFFSET 1 +#define KVM_MMIO_PAGE_OFFSET 2 #define CR3_PAE_RESERVED_BITS ((X86_CR3_PWT | X86_CR3_PCD) - 1) #define CR3_NONPAE_RESERVED_BITS ((PAGE_SIZE-1) & ~(X86_CR3_PWT | X86_CR3_PCD)) @@ -293,6 +294,21 @@ struct kvm_mem_alias { gfn_t target_gfn; }; +#define KVM_MAX_DELAYED_MMIO_ZONE 10 +#define KVM_MAX_EXCLUDED_MMIO_ZONE 10 + +struct kvm_excluded_mmio_zone { + u32 offset; + u32 size; +}; + +struct kvm_delayed_mmio_zone { + u64 addr; + u32 size; + u32 nb_excluded_zones; + struct kvm_excluded_mmio_zone excluded[KVM_MAX_EXCLUDED_MMIO_ZONE]; +}; + struct kvm_arch{ int naliases; struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS]; @@ -317,6 +333,13 @@ struct kvm_arch{ struct page *ept_identity_pagetable; bool ept_identity_pagetable_done; + + /* MMIO batch */ + + spinlock_t batch_lock; + struct kvm_batch *batch; + int nb_mmio_zones; + struct kvm_delayed_mmio_zone mmio_zone[KVM_MAX_DELAYED_MMIO_ZONE]; }; struct kvm_vm_stat { diff --git a/include/linux/kvm.h b/include/linux/kvm.h index a281afe..b57010d 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -173,6 +173,21 @@ struct kvm_run { }; }; +struct kvm_mmio { + __u64 phys_addr; + __u32 len; + __u32 pad; + __u8 data[8]; +}; + +struct kvm_batch { + __u32 first, last; + struct kvm_mmio mmio[0]; +}; + +#define KVM_MAX_BATCH ((PAGE_SIZE - sizeof(struct kvm_batch)) / \ + sizeof(struct kvm_mmio)) + /* for KVM_TRANSLATE */ struct kvm_translation { /* in */ @@ -371,6 +386,7 @@ struct kvm_trace_rec { #define KVM_CREATE_PIT _IO(KVMIO, 0x64) #define KVM_GET_PIT _IOWR(KVMIO, 0x65, struct kvm_pit_state) #define KVM_SET_PIT _IOR(KVMIO, 0x66, struct kvm_pit_state) +#define KVM_SET_MMIO _IOW(KVMIO, 0x67, struct kvm_mmio_zone) /* * ioctls for vcpu fds diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 64ed402..c8f1bdf 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -824,6 +824,8 @@ static int kvm_vcpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf) #ifdef CONFIG_X86 else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET) page = virt_to_page(vcpu->arch.pio_data); + else if (vmf->pgoff == KVM_MMIO_PAGE_OFFSET) + page = virt_to_page(vcpu->kvm->arch.batch); #endif else return VM_FAULT_SIGBUS; @@ -1230,6 +1232,7 @@ static long kvm_dev_ioctl(struct file *filp, r = PAGE_SIZE; /* struct kvm_run */ #ifdef CONFIG_X86 r += PAGE_SIZE; /* pio data page */ + r += PAGE_SIZE; /* mmio batch page */ #endif break; case KVM_TRACE_ENABLE: -- 1.5.2.4 |
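Stripped of the KVM plumbing, the first/last ring that batch_mmio() fills above reduces to the sketch below (plain user-space C, illustrative only): the producer advances last, the consumer advances first, and one slot is always left unused so that full and empty are distinguishable. In the patch itself the ring lives in the page mapped at KVM_MMIO_PAGE_OFFSET, the producer side runs under batch_lock because every vcpu can append, and each entry carries at most 8 bytes of data.

#include <stdint.h>
#include <string.h>

#define RING_ENTRIES 170                  /* KVM_MAX_BATCH with 4 KiB pages */

struct mmio_entry {
        uint64_t phys_addr;
        uint32_t len;                     /* <= 8 */
        uint32_t pad;
        uint8_t  data[8];
};

struct mmio_ring {
        uint32_t first;                   /* next entry to consume */
        uint32_t last;                    /* next entry to fill */
        struct mmio_entry mmio[RING_ENTRIES];
};

/* Producer: returns 0 when the ring is full and the write has to be
 * handed to userspace immediately instead of being queued. */
static int ring_push(struct mmio_ring *r, uint64_t addr,
                     const void *val, uint32_t len)
{
        uint32_t next = (r->last + 1) % RING_ENTRIES;

        if (next == r->first)             /* would catch up with the consumer */
                return 0;

        r->mmio[r->last].phys_addr = addr;
        r->mmio[r->last].len = len;
        memcpy(r->mmio[r->last].data, val, len);
        r->last = next;
        return 1;
}

/* Consumer: returns 0 when the ring is empty. */
static int ring_pop(struct mmio_ring *r, struct mmio_entry *out)
{
        if (r->first == r->last)
                return 0;

        *out = r->mmio[r->first];
        r->first = (r->first + 1) % RING_ENTRIES;
        return 1;
}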
From: Laurent V. <Lau...@bu...> - 2008-05-15 13:51:39
|
This patch is userspace part of the "batch writes to MMIO" patch. It defines delayed MMIO zone using kvm_set_mmio() (for VGA and e1000). It empties the ring buffer and process the MMIO accesses. Signed-off-by: Laurent Vivier <Lau...@bu...> --- libkvm/libkvm-x86.c | 18 ++++++++++++++++++ libkvm/libkvm.c | 13 +++++++++++++ libkvm/libkvm.h | 2 ++ qemu/hw/cirrus_vga.c | 2 ++ qemu/hw/e1000.c | 8 ++++++++ qemu/hw/vga.c | 4 ++++ qemu/qemu-kvm.c | 6 ++++++ qemu/qemu-kvm.h | 2 ++ 8 files changed, 55 insertions(+), 0 deletions(-) diff --git a/libkvm/libkvm-x86.c b/libkvm/libkvm-x86.c index d46fdcc..911e079 100644 --- a/libkvm/libkvm-x86.c +++ b/libkvm/libkvm-x86.c @@ -391,6 +391,24 @@ int kvm_set_pit(kvm_context_t kvm, struct kvm_pit_state *s) #endif +int kvm_set_mmio(kvm_context_t kvm, + uint8_t is_delayed, uint64_t addr, uint32_t size) +{ + struct kvm_mmio_zone zone; + int r; + + zone.is_delayed = is_delayed; + zone.addr = addr; + zone.size = size; + + r = ioctl(kvm->vm_fd, KVM_SET_MMIO, &zone); + if (r == -1) { + r = -errno; + perror("kvm_set_mmio"); + } + return r; +} + void kvm_show_code(kvm_context_t kvm, int vcpu) { #define SHOW_CODE_LEN 50 diff --git a/libkvm/libkvm.c b/libkvm/libkvm.c index d1e95a4..b891630 100644 --- a/libkvm/libkvm.c +++ b/libkvm/libkvm.c @@ -861,6 +861,9 @@ int kvm_run(kvm_context_t kvm, int vcpu) int r; int fd = kvm->vcpu_fd[vcpu]; struct kvm_run *run = kvm->run[vcpu]; +#if defined(__x86_64__) || defined(__i386__) + struct kvm_batch *batch = (void *)run + 2 * PAGE_SIZE; +#endif again: if (!kvm->irqchip_in_kernel) @@ -879,6 +882,16 @@ again: post_kvm_run(kvm, vcpu); +#if defined(__x86_64__) || defined(__i386__) + while (batch->first != batch->last) { + kvm->callbacks->mmio_write(kvm->opaque, + batch->mmio[batch->first].phys_addr, + &batch->mmio[batch->first].data[0], + batch->mmio[batch->first].len); + batch->first = (batch->first + 1) % KVM_MAX_BATCH; + } +#endif + if (r == -1) { r = handle_io_window(kvm); goto more; diff --git a/libkvm/libkvm.h b/libkvm/libkvm.h index 31c0d59..1f453e1 100644 --- a/libkvm/libkvm.h +++ b/libkvm/libkvm.h @@ -448,6 +448,8 @@ int kvm_get_dirty_pages_range(kvm_context_t kvm, unsigned long phys_addr, unsigned long end_addr, void *buf, void*opaque, int (*cb)(unsigned long start, unsigned long len, void*bitmap, void *opaque)); +int kvm_set_mmio(kvm_context_t kvm, + uint8_t is_delayed, uint64_t addr, uint32_t size); /*! 
* \brief Create a memory alias diff --git a/qemu/hw/cirrus_vga.c b/qemu/hw/cirrus_vga.c index 2c4aeec..4ef8085 100644 --- a/qemu/hw/cirrus_vga.c +++ b/qemu/hw/cirrus_vga.c @@ -3291,6 +3291,8 @@ static void cirrus_init_common(CirrusVGAState * s, int device_id, int is_pci) cirrus_vga_mem_write, s); cpu_register_physical_memory(isa_mem_base + 0x000a0000, 0x20000, vga_io_memory); + if (kvm_enabled()) + qemu_kvm_set_mmio(1, isa_mem_base + 0x000a0000, 0x20000); s->sr[0x06] = 0x0f; if (device_id == CIRRUS_ID_CLGD5446) { diff --git a/qemu/hw/e1000.c b/qemu/hw/e1000.c index 0728539..d223631 100644 --- a/qemu/hw/e1000.c +++ b/qemu/hw/e1000.c @@ -26,6 +26,7 @@ #include "hw.h" #include "pci.h" #include "net.h" +#include "qemu-kvm.h" #include "e1000_hw.h" @@ -938,6 +939,13 @@ e1000_mmio_map(PCIDevice *pci_dev, int region_num, d->mmio_base = addr; cpu_register_physical_memory(addr, PNPMMIO_SIZE, d->mmio_index); + + if (kvm_enabled()) { + qemu_kvm_set_mmio(1, addr, PNPMMIO_SIZE); + qemu_kvm_set_mmio(0, addr + E1000_TCTL, 4); + qemu_kvm_set_mmio(0, addr + E1000_TDT, 4); + qemu_kvm_set_mmio(0, addr + E1000_ICR, 4); + } } static int diff --git a/qemu/hw/vga.c b/qemu/hw/vga.c index 3a49573..844c2a7 100644 --- a/qemu/hw/vga.c +++ b/qemu/hw/vga.c @@ -2257,6 +2257,8 @@ void vga_init(VGAState *s) vga_io_memory = cpu_register_io_memory(0, vga_mem_read, vga_mem_write, s); cpu_register_physical_memory(isa_mem_base + 0x000a0000, 0x20000, vga_io_memory); + if (kvm_enabled()) + qemu_kvm_set_mmio(1, isa_mem_base + 0x000a0000, 0x20000); } /* Memory mapped interface */ @@ -2332,6 +2334,8 @@ static void vga_mm_init(VGAState *s, target_phys_addr_t vram_base, cpu_register_physical_memory(ctrl_base, 0x100000, s_ioport_ctrl); s->bank_offset = 0; cpu_register_physical_memory(vram_base + 0x000a0000, 0x20000, vga_io_memory); + if (kvm_enabled()) + qemu_kvm_set_mmio(1, vram_base + 0x000a0000, 0x20000); } int isa_vga_init(DisplayState *ds, uint8_t *vga_ram_base, diff --git a/qemu/qemu-kvm.c b/qemu/qemu-kvm.c index e522269..9f03ab1 100644 --- a/qemu/qemu-kvm.c +++ b/qemu/qemu-kvm.c @@ -955,3 +955,9 @@ void kvm_mutex_lock(void) pthread_mutex_lock(&qemu_mutex); cpu_single_env = NULL; } + +int qemu_kvm_set_mmio(int is_delayed, + target_phys_addr_t addr, unsigned int size) +{ + return kvm_set_mmio(kvm_context, is_delayed, addr, size); +} diff --git a/qemu/qemu-kvm.h b/qemu/qemu-kvm.h index 21606e9..37d4d11 100644 --- a/qemu/qemu-kvm.h +++ b/qemu/qemu-kvm.h @@ -75,6 +75,8 @@ int handle_tpr_access(void *opaque, int vcpu, void kvm_tpr_vcpu_start(CPUState *env); int qemu_kvm_get_dirty_pages(unsigned long phys_addr, void *buf); +int qemu_kvm_set_mmio(int is_delayed, + target_phys_addr_t addr, unsigned int size); void qemu_kvm_system_reset_request(void); -- 1.5.2.4 |
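A device model that talks to the new ioctl directly, rather than through the kvm_set_mmio() wrapper above, would do roughly the following. This is a sketch only: it assumes a kernel with patch 1 applied so that <linux/kvm.h> provides KVM_SET_MMIO and struct kvm_mmio_zone, and the addresses in the usage comment are just the VGA and e1000 examples from this patch.

#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <stdint.h>
#include <stdio.h>

static int set_mmio_zone(int vm_fd, int is_delayed, uint64_t addr, uint32_t size)
{
        struct kvm_mmio_zone zone = {
                .is_delayed = is_delayed,
                .addr = addr,
                .size = size,
        };

        if (ioctl(vm_fd, KVM_SET_MMIO, &zone) < 0) {
                perror("KVM_SET_MMIO");
                return -1;
        }
        return 0;
}

/* e.g.
 *   set_mmio_zone(vm_fd, 1, 0xa0000, 0x20000);             whole VGA window delayable
 *   set_mmio_zone(vm_fd, 0, mmio_base + E1000_ICR, 4);     but keep ICR synchronous
 */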