From: Marcelo T. <mto...@re...> - 2008-04-10 20:16:01

Timers that fire between guest hlt and vcpu_block's add_wait_queue() are
ignored, possibly resulting in hangs.

Also make sure that atomic_inc and waitqueue_active tests happen in the
specified order, otherwise the following race is open:

CPU0					CPU1
if (waitqueue_active(wq))
					add_wait_queue()
					if (!atomic_read(pit_timer->pending))
						schedule()
atomic_inc(pit_timer->pending)

Which is not an issue for the APIC timer due to migration logic.

Signed-off-by: Marcelo Tosatti <mto...@re...>

Index: kvm/arch/x86/kvm/i8254.c
===================================================================
--- kvm.orig/arch/x86/kvm/i8254.c
+++ kvm/arch/x86/kvm/i8254.c
@@ -199,6 +199,7 @@ int __pit_timer_fn(struct kvm_kpit_state
 	struct kvm_kpit_timer *pt = &ps->pit_timer;

 	atomic_inc(&pt->pending);
+	smp_mb__after_atomic_inc();
 	if (vcpu0 && waitqueue_active(&vcpu0->wq)) {
 		vcpu0->arch.mp_state = VCPU_MP_STATE_RUNNABLE;
 		wake_up_interruptible(&vcpu0->wq);
@@ -210,6 +211,16 @@ int __pit_timer_fn(struct kvm_kpit_state
 	return (pt->period == 0 ? 0 : 1);
 }

+int pit_has_pending_timer(struct kvm_vcpu *vcpu)
+{
+	struct kvm_pit *pit = vcpu->kvm->arch.vpit;
+
+	if (pit && vcpu->vcpu_id == 0)
+		return atomic_read(&pit->pit_state.pit_timer.pending);
+
+	return 0;
+}
+
 static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
 {
 	struct kvm_kpit_state *ps;
Index: kvm/arch/x86/kvm/irq.c
===================================================================
--- kvm.orig/arch/x86/kvm/irq.c
+++ kvm/arch/x86/kvm/irq.c
@@ -26,6 +26,21 @@
 #include "i8254.h"

 /*
+ * check if there are pending timer events
+ * to be processed.
+ */
+int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
+{
+	int ret;
+
+	ret = pit_has_pending_timer(vcpu);
+	ret |= apic_has_pending_timer(vcpu);
+
+	return ret;
+}
+EXPORT_SYMBOL(kvm_cpu_has_pending_timer);
+
+/*
  * check if there is pending interrupt without
  * intack.
  */
Index: kvm/arch/x86/kvm/irq.h
===================================================================
--- kvm.orig/arch/x86/kvm/irq.h
+++ kvm/arch/x86/kvm/irq.h
@@ -85,4 +85,7 @@ void kvm_inject_pending_timer_irqs(struc
 void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu);
 void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu);

+int pit_has_pending_timer(struct kvm_vcpu *vcpu);
+int apic_has_pending_timer(struct kvm_vcpu *vcpu);
+
 #endif
Index: kvm/arch/x86/kvm/lapic.c
===================================================================
--- kvm.orig/arch/x86/kvm/lapic.c
+++ kvm/arch/x86/kvm/lapic.c
@@ -952,6 +952,16 @@ static int __apic_timer_fn(struct kvm_la
 	return result;
 }

+int apic_has_pending_timer(struct kvm_vcpu *vcpu)
+{
+	struct kvm_lapic *lapic = vcpu->arch.apic;
+
+	if (lapic)
+		return atomic_read(&lapic->timer.pending);
+
+	return 0;
+}
+
 static int __inject_apic_timer_irq(struct kvm_lapic *apic)
 {
 	int vector;
Index: kvm/include/linux/kvm_host.h
===================================================================
--- kvm.orig/include/linux/kvm_host.h
+++ kvm/include/linux/kvm_host.h
@@ -272,6 +272,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm

 int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
 int kvm_cpu_has_interrupt(struct kvm_vcpu *v);
+int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu);
 void kvm_vcpu_kick(struct kvm_vcpu *vcpu);

 static inline void kvm_guest_enter(void)
Index: kvm/virt/kvm/kvm_main.c
===================================================================
--- kvm.orig/virt/kvm/kvm_main.c
+++ kvm/virt/kvm/kvm_main.c
@@ -752,6 +752,7 @@ void mark_page_dirty(struct kvm *kvm, gf
 	}
 }

+#ifdef CONFIG_X86
 /*
  * The vCPU has executed a HLT instruction with in-kernel mode enabled.
  */
@@ -765,6 +766,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcp
 	 * We will block until either an interrupt or a signal wakes us up
 	 */
 	while (!kvm_cpu_has_interrupt(vcpu)
+		&& !kvm_cpu_has_pending_timer(vcpu)
 		&& !signal_pending(current)
 		&& !kvm_arch_vcpu_runnable(vcpu)) {
 		set_current_state(TASK_INTERRUPTIBLE);
@@ -776,6 +778,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcp
 	__set_current_state(TASK_RUNNING);
 	remove_wait_queue(&vcpu->wq, &wait);
 }
+#endif

 void kvm_resched(struct kvm_vcpu *vcpu)
 {
From: Marcelo T. <mto...@re...> - 2008-04-10 20:17:24

There is a window open between testing of pending IRQ's and assignment
of guest_mode in __vcpu_run.

Injection of IRQ's can race with __vcpu_run as follows:

CPU0				CPU1
kvm_x86_ops->run()
vcpu->guest_mode = 0		SET_IRQ_LINE ioctl
..
kvm_x86_ops->inject_pending_irq
kvm_cpu_has_interrupt()
				apic_test_and_set_irr()
				kvm_vcpu_kick
				if (vcpu->guest_mode)
					send_ipi()
vcpu->guest_mode = 1

So move guest_mode=1 assignment before ->inject_pending_irq, and make
sure that it won't reorder after it.

Signed-off-by: Marcelo Tosatti <mto...@re...>

Index: kvm/arch/x86/kvm/x86.c
===================================================================
--- kvm.orig/arch/x86/kvm/x86.c
+++ kvm/arch/x86/kvm/x86.c
@@ -2777,6 +2777,13 @@ again:
 		goto out;
 	}

+	vcpu->guest_mode = 1;
+	/*
+	 * Make sure that guest_mode assignment won't happen after
+	 * testing the pending IRQ vector bitmap.
+	 */
+	smp_wmb();
+
 	if (vcpu->arch.exception.pending)
 		__queue_exception(vcpu);
 	else if (irqchip_in_kernel(vcpu->kvm))
@@ -2788,7 +2795,6 @@ again:

 	up_read(&vcpu->kvm->slots_lock);

-	vcpu->guest_mode = 1;
 	kvm_guest_enter();

 	if (vcpu->requests)
@@ -3944,11 +3950,12 @@ static void vcpu_kick_intr(void *info)
 void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
 {
 	int ipi_pcpu = vcpu->cpu;
+	int cpu = smp_processor_id();

 	if (waitqueue_active(&vcpu->wq)) {
 		wake_up_interruptible(&vcpu->wq);
 		++vcpu->stat.halt_wakeup;
 	}
-	if (vcpu->guest_mode)
+	if (vcpu->guest_mode && vcpu->cpu != cpu)
 		smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0, 0);
 }
From: Avi K. <av...@qu...> - 2008-04-11 12:13:31

Marcelo Tosatti wrote:
> Timers that fire between guest hlt and vcpu_block's add_wait_queue() are
> ignored, possibly resulting in hangs.
>
> Also make sure that atomic_inc and waitqueue_active tests happen in the
> specified order, otherwise the following race is open:
>
> CPU0					CPU1
> if (waitqueue_active(wq))
>					add_wait_queue()
>					if (!atomic_read(pit_timer->pending))
>						schedule()
> atomic_inc(pit_timer->pending)
>
> Which is not an issue for the APIC timer due to migration logic.
>

Nasty. I hope we can get Dor's interrupt injection notification working,
so we don't have to handle these bugs.

> Index: kvm/virt/kvm/kvm_main.c
> ===================================================================
> --- kvm.orig/virt/kvm/kvm_main.c
> +++ kvm/virt/kvm/kvm_main.c
> @@ -752,6 +752,7 @@ void mark_page_dirty(struct kvm *kvm, gf
>  	}
>  }
>
> +#ifdef CONFIG_X86
>  /*
>   * The vCPU has executed a HLT instruction with in-kernel mode enabled.
>   */
> @@ -765,6 +766,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcp
>

This breaks ia64 (and shouldn't s390 use this too?)

>  	 * We will block until either an interrupt or a signal wakes us up
>  	 */
>  	while (!kvm_cpu_has_interrupt(vcpu)
> +		&& !kvm_cpu_has_pending_timer(vcpu)
>

I guess the fix is to stub this out for the other archs.

-- 
Any sufficiently difficult bug is indistinguishable from a feature.
From: Avi K. <av...@qu...> - 2008-04-11 12:19:08

Marcelo Tosatti wrote:
> There is a window open between testing of pending IRQ's
> and assignment of guest_mode in __vcpu_run.
>
> Injection of IRQ's can race with __vcpu_run as follows:
>
> CPU0				CPU1
> kvm_x86_ops->run()
> vcpu->guest_mode = 0		SET_IRQ_LINE ioctl
>				..
> kvm_x86_ops->inject_pending_irq
> kvm_cpu_has_interrupt()
>
>				apic_test_and_set_irr()
>				kvm_vcpu_kick
>				if (vcpu->guest_mode)
>					send_ipi()
>
> vcpu->guest_mode = 1
>
> So move guest_mode=1 assignment before ->inject_pending_irq, and make
> sure that it won't reorder after it.
>
> @@ -3944,11 +3950,12 @@ static void vcpu_kick_intr(void *info)
>  void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
>  {
>  	int ipi_pcpu = vcpu->cpu;
> +	int cpu = smp_processor_id();
>
>  	if (waitqueue_active(&vcpu->wq)) {
>  		wake_up_interruptible(&vcpu->wq);
>  		++vcpu->stat.halt_wakeup;
>  	}
> -	if (vcpu->guest_mode)
> +	if (vcpu->guest_mode && vcpu->cpu != cpu)
>  		smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0, 0);
>  }
>

kvm_vcpu_kick() can be called from nonatomic contexts, so the
vcpu->cpu == cpu check is dangerous (and will warn on preemptible
kernels, no?)

-- 
Any sufficiently difficult bug is indistinguishable from a feature.
From: Marcelo T. <mto...@re...> - 2008-04-11 17:50:19

On Fri, Apr 11, 2008 at 03:12:41PM +0300, Avi Kivity wrote:
> This breaks ia64 (and shouldn't s390 use this too?)
> >  	 * We will block until either an interrupt or a signal wakes us up
> >  	 */
> >  	while (!kvm_cpu_has_interrupt(vcpu)
> > +		&& !kvm_cpu_has_pending_timer(vcpu)
>
> I guess the fix is to stub this out for the other archs.

Agreed. How's this.

-----------

KVM: hlt emulation should take in-kernel APIC/PIT timers into account

Timers that fire between guest hlt and vcpu_block's add_wait_queue() are
ignored, possibly resulting in hangs.

Also make sure that atomic_inc and waitqueue_active tests happen in the
specified order, otherwise the following race is open:

CPU0					CPU1
if (waitqueue_active(wq))
					add_wait_queue()
					if (!atomic_read(pit_timer->pending))
						schedule()
atomic_inc(pit_timer->pending)

Signed-off-by: Marcelo Tosatti <mto...@re...>

Index: kvm/arch/ia64/kvm/kvm-ia64.c
===================================================================
--- kvm.orig/arch/ia64/kvm/kvm-ia64.c
+++ kvm/arch/ia64/kvm/kvm-ia64.c
@@ -1778,6 +1778,11 @@ int kvm_cpu_has_interrupt(struct kvm_vcp
 	return 0;
 }

+int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
+{
+	return 0;
+}
+
 gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
 {
 	return gfn;
Index: kvm/arch/s390/kvm/interrupt.c
===================================================================
--- kvm.orig/arch/s390/kvm/interrupt.c
+++ kvm/arch/s390/kvm/interrupt.c
@@ -321,6 +321,11 @@ int kvm_cpu_has_interrupt(struct kvm_vcp
 	return rc;
 }

+int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
+{
+	return 0;
+}
+
 int kvm_s390_handle_wait(struct kvm_vcpu *vcpu)
 {
 	u64 now, sltime;
Index: kvm/arch/x86/kvm/i8254.c
===================================================================
--- kvm.orig/arch/x86/kvm/i8254.c
+++ kvm/arch/x86/kvm/i8254.c
@@ -199,6 +199,7 @@ int __pit_timer_fn(struct kvm_kpit_state
 	struct kvm_kpit_timer *pt = &ps->pit_timer;

 	atomic_inc(&pt->pending);
+	smp_mb__after_atomic_inc();
 	if (vcpu0 && waitqueue_active(&vcpu0->wq)) {
 		vcpu0->arch.mp_state = VCPU_MP_STATE_RUNNABLE;
 		wake_up_interruptible(&vcpu0->wq);
@@ -210,6 +211,16 @@ int __pit_timer_fn(struct kvm_kpit_state
 	return (pt->period == 0 ? 0 : 1);
 }

+int pit_has_pending_timer(struct kvm_vcpu *vcpu)
+{
+	struct kvm_pit *pit = vcpu->kvm->arch.vpit;
+
+	if (pit && vcpu->vcpu_id == 0)
+		return atomic_read(&pit->pit_state.pit_timer.pending);
+
+	return 0;
+}
+
 static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
 {
 	struct kvm_kpit_state *ps;
Index: kvm/arch/x86/kvm/irq.c
===================================================================
--- kvm.orig/arch/x86/kvm/irq.c
+++ kvm/arch/x86/kvm/irq.c
@@ -26,6 +26,21 @@
 #include "i8254.h"

 /*
+ * check if there are pending timer events
+ * to be processed.
+ */
+int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
+{
+	int ret;
+
+	ret = pit_has_pending_timer(vcpu);
+	ret |= apic_has_pending_timer(vcpu);
+
+	return ret;
+}
+EXPORT_SYMBOL(kvm_cpu_has_pending_timer);
+
+/*
  * check if there is pending interrupt without
  * intack.
  */
Index: kvm/arch/x86/kvm/irq.h
===================================================================
--- kvm.orig/arch/x86/kvm/irq.h
+++ kvm/arch/x86/kvm/irq.h
@@ -85,4 +85,7 @@ void kvm_inject_pending_timer_irqs(struc
 void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu);
 void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu);

+int pit_has_pending_timer(struct kvm_vcpu *vcpu);
+int apic_has_pending_timer(struct kvm_vcpu *vcpu);
+
 #endif
Index: kvm/arch/x86/kvm/lapic.c
===================================================================
--- kvm.orig/arch/x86/kvm/lapic.c
+++ kvm/arch/x86/kvm/lapic.c
@@ -952,6 +952,16 @@ static int __apic_timer_fn(struct kvm_la
 	return result;
 }

+int apic_has_pending_timer(struct kvm_vcpu *vcpu)
+{
+	struct kvm_lapic *lapic = vcpu->arch.apic;
+
+	if (lapic)
+		return atomic_read(&lapic->timer.pending);
+
+	return 0;
+}
+
 static int __inject_apic_timer_irq(struct kvm_lapic *apic)
 {
 	int vector;
Index: kvm/include/linux/kvm_host.h
===================================================================
--- kvm.orig/include/linux/kvm_host.h
+++ kvm/include/linux/kvm_host.h
@@ -272,6 +272,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm

 int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
 int kvm_cpu_has_interrupt(struct kvm_vcpu *v);
+int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu);
 void kvm_vcpu_kick(struct kvm_vcpu *vcpu);

 static inline void kvm_guest_enter(void)
Index: kvm/virt/kvm/kvm_main.c
===================================================================
--- kvm.orig/virt/kvm/kvm_main.c
+++ kvm/virt/kvm/kvm_main.c
@@ -765,6 +765,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcp
 	 * We will block until either an interrupt or a signal wakes us up
 	 */
 	while (!kvm_cpu_has_interrupt(vcpu)
+		&& !kvm_cpu_has_pending_timer(vcpu)
 		&& !signal_pending(current)
 		&& !kvm_arch_vcpu_runnable(vcpu)) {
 		set_current_state(TASK_INTERRUPTIBLE);
From: Marcelo T. <mto...@re...> - 2008-04-11 17:57:59

On Fri, Apr 11, 2008 at 03:18:19PM +0300, Avi Kivity wrote:
> kvm_vcpu_kick() can be called from nonatomic contexts, so the vcpu->cpu
> == cpu check is dangerous (and will warn on preemptible kernels, no?)

Doh, right. How's this.

-----------

KVM: fix kvm_vcpu_kick vs __vcpu_run race

There is a window open between testing of pending IRQ's
and assignment of guest_mode in __vcpu_run.

Injection of IRQ's can race with __vcpu_run as follows:

CPU0				CPU1
kvm_x86_ops->run()
vcpu->guest_mode = 0		SET_IRQ_LINE ioctl
..
kvm_x86_ops->inject_pending_irq
kvm_cpu_has_interrupt()
				apic_test_and_set_irr()
				kvm_vcpu_kick
				if (vcpu->guest_mode)
					send_ipi()
vcpu->guest_mode = 1

So move guest_mode=1 assignment before ->inject_pending_irq, and make
sure that it won't reorder after it.

Signed-off-by: Marcelo Tosatti <mto...@re...>

Index: kvm/arch/x86/kvm/x86.c
===================================================================
--- kvm.orig/arch/x86/kvm/x86.c
+++ kvm/arch/x86/kvm/x86.c
@@ -2777,6 +2777,13 @@ again:
 		goto out;
 	}

+	vcpu->guest_mode = 1;
+	/*
+	 * Make sure that guest_mode assignment won't happen after
+	 * testing the pending IRQ vector bitmap.
+	 */
+	smp_wmb();
+
 	if (vcpu->arch.exception.pending)
 		__queue_exception(vcpu);
 	else if (irqchip_in_kernel(vcpu->kvm))
@@ -2788,7 +2795,6 @@ again:

 	up_read(&vcpu->kvm->slots_lock);

-	vcpu->guest_mode = 1;
 	kvm_guest_enter();

 	if (vcpu->requests)
@@ -3944,11 +3950,13 @@ static void vcpu_kick_intr(void *info)
 void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
 {
 	int ipi_pcpu = vcpu->cpu;
+	int cpu = get_cpu();

 	if (waitqueue_active(&vcpu->wq)) {
 		wake_up_interruptible(&vcpu->wq);
 		++vcpu->stat.halt_wakeup;
 	}
-	if (vcpu->guest_mode)
+	if (vcpu->guest_mode && vcpu->cpu != cpu)
 		smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0, 0);
+	put_cpu();
 }
From: Carsten O. <co...@de...> - 2008-04-11 22:32:14

Avi Kivity wrote:
>> @@ -765,6 +766,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcp
>>
>
> This breaks ia64 (and shouldn't s390 use this too?)
>>  	 * We will block until either an interrupt or a signal wakes us up
>>  	 */
>>  	while (!kvm_cpu_has_interrupt(vcpu)
>> +		&& !kvm_cpu_has_pending_timer(vcpu)
>>
>
> I guess the fix is to stub this out for the other archs.

We don't use that, we have our own implementation of vcpu_block.
From: Avi K. <av...@qu...> - 2008-04-13 09:28:24

Marcelo Tosatti wrote:
> On Fri, Apr 11, 2008 at 03:12:41PM +0300, Avi Kivity wrote:
>
>> This breaks ia64 (and shouldn't s390 use this too?)
>>
>>> 	 * We will block until either an interrupt or a signal wakes us up
>>> 	 */
>>> 	while (!kvm_cpu_has_interrupt(vcpu)
>>> +		&& !kvm_cpu_has_pending_timer(vcpu)
>>>
>> I guess the fix is to stub this out for the other archs.
>
> Agreed. How's this.
>

Better :); applied.

-- 
error compiling committee.c: too many arguments to function
From: Avi K. <av...@qu...> - 2008-04-13 09:47:46

Carsten Otte wrote:
> Avi Kivity wrote:
>>> @@ -765,6 +766,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcp
>>>
>>
>> This breaks ia64 (and shouldn't s390 use this too?)
>>> 	 * We will block until either an interrupt or a signal wakes us up
>>> 	 */
>>> 	while (!kvm_cpu_has_interrupt(vcpu)
>>> +		&& !kvm_cpu_has_pending_timer(vcpu)
>>
>> I guess the fix is to stub this out for the other archs.
>
> We don't use that, we have our own implementation of vcpu_block.

Why?

-- 
error compiling committee.c: too many arguments to function
From: Carsten O. <co...@de...> - 2008-04-14 09:18:22

Avi Kivity wrote:
> Why?

This one doesn't work for us. Our arch defines various reasons why we
would not fall asleep but do something else, and we need to check them
while atomic, under a lock that other archs don't have, before sleeping.
See kvm_s390_handle_wait in arch/s390/kvm/interrupt.c.
From: Avi K. <av...@qu...> - 2008-04-13 10:08:21

Marcelo Tosatti wrote:
> On Fri, Apr 11, 2008 at 03:18:19PM +0300, Avi Kivity wrote:
>
>> kvm_vcpu_kick() can be called from nonatomic contexts, so the vcpu->cpu
>> == cpu check is dangerous (and will warn on preemptible kernels, no?)
>
> Doh, right. How's this.
>
> -----------
>
> KVM: fix kvm_vcpu_kick vs __vcpu_run race
>
> There is a window open between testing of pending IRQ's
> and assignment of guest_mode in __vcpu_run.
>
> Injection of IRQ's can race with __vcpu_run as follows:
>
> CPU0				CPU1
> kvm_x86_ops->run()
> vcpu->guest_mode = 0		SET_IRQ_LINE ioctl
>				..
> kvm_x86_ops->inject_pending_irq
> kvm_cpu_has_interrupt()
>
>				apic_test_and_set_irr()
>				kvm_vcpu_kick
>				if (vcpu->guest_mode)
>					send_ipi()
>
> vcpu->guest_mode = 1
>
> So move guest_mode=1 assignment before ->inject_pending_irq, and make
> sure that it won't reorder after it.
>

Applied, but this

> @@ -3944,11 +3950,13 @@ static void vcpu_kick_intr(void *info)
>  void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
>  {
>  	int ipi_pcpu = vcpu->cpu;
> +	int cpu = get_cpu();
>
>  	if (waitqueue_active(&vcpu->wq)) {
>  		wake_up_interruptible(&vcpu->wq);
>  		++vcpu->stat.halt_wakeup;
>  	}
> -	if (vcpu->guest_mode)
> +	if (vcpu->guest_mode && vcpu->cpu != cpu)
>  		smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0, 0);
> +	put_cpu();
>  }
>

Looks like a no-op now, as vcpu_kick_intr() does nothing and
smp_call_function_single() won't force an exit if vcpu->cpu == cpu, so
I dropped this hunk.

-- 
error compiling committee.c: too many arguments to function
From: Avi K. <av...@qu...> - 2008-04-13 16:08:01

Avi Kivity wrote:
>
>> @@ -3944,11 +3950,13 @@ static void vcpu_kick_intr(void *info)
>>  void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
>>  {
>>  	int ipi_pcpu = vcpu->cpu;
>> +	int cpu = get_cpu();
>>
>>  	if (waitqueue_active(&vcpu->wq)) {
>>  		wake_up_interruptible(&vcpu->wq);
>>  		++vcpu->stat.halt_wakeup;
>>  	}
>> -	if (vcpu->guest_mode)
>> +	if (vcpu->guest_mode && vcpu->cpu != cpu)
>>  		smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0, 0);
>> +	put_cpu();
>>  }
>>
>
> Looks like a no-op now, as vcpu_kick_intr() does nothing and
> smp_call_function_single() won't force an exit if vcpu->cpu == cpu, so
> I dropped this hunk.
>

Oh, I see the reason now: the irq stuff now happens with irqs disabled
which annoys smp_call_function_single(). Well, I'd like to avoid all
this irq-disabled processing, so I'm looking at an alternate fix using
a new KVM_REQ_EVAL_IRQ.

-- 
error compiling committee.c: too many arguments to function
From: Avi K. <av...@qu...> - 2008-04-13 16:35:46

Avi Kivity wrote:
> Avi Kivity wrote:
>>
>>> @@ -3944,11 +3950,13 @@ static void vcpu_kick_intr(void *info)
>>>  void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
>>>  {
>>>  	int ipi_pcpu = vcpu->cpu;
>>> +	int cpu = get_cpu();
>>>
>>>  	if (waitqueue_active(&vcpu->wq)) {
>>>  		wake_up_interruptible(&vcpu->wq);
>>>  		++vcpu->stat.halt_wakeup;
>>>  	}
>>> -	if (vcpu->guest_mode)
>>> +	if (vcpu->guest_mode && vcpu->cpu != cpu)
>>>  		smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0, 0);
>>> +	put_cpu();
>>>  }
>>>
>>
>> Looks like a no-op now, as vcpu_kick_intr() does nothing and
>> smp_call_function_single() won't force an exit if vcpu->cpu == cpu,
>> so I dropped this hunk.
>>
>
> Oh, I see the reason now: the irq stuff now happens with irqs disabled
> which annoys smp_call_function_single(). Well, I'd like to avoid all
> this irq-disabled processing, so I'm looking at an alternate fix using
> a new KVM_REQ_EVAL_IRQ.
>

!@%$#@%, we can't move guest irq processing out of the critical section
since we can't commit to irq delivery (we may have to deliver an
exception instead).

I'll apply your patch as is, and will look at disentangling this later.

-- 
error compiling committee.c: too many arguments to function
From: Yang, S. <she...@in...> - 2008-05-09 07:43:31

On Sunday 13 April 2008 17:28:22 Avi Kivity wrote:
> Marcelo Tosatti wrote:
> > On Fri, Apr 11, 2008 at 03:12:41PM +0300, Avi Kivity wrote:
> >> This breaks ia64 (and shouldn't s390 use this too?)
> >>
> >>> 	 * We will block until either an interrupt or a signal wakes us up
> >>> 	 */
> >>> 	while (!kvm_cpu_has_interrupt(vcpu)
> >>> +		&& !kvm_cpu_has_pending_timer(vcpu)
> >>
> >> I guess the fix is to stub this out for the other archs.
> >
> > Agreed. How's this.
>
> Better :); applied.

Hi, Marcelo

This patch gets into trouble when the OS doesn't use the PIT/LAPIC
timer but doesn't disable them either. The pending counters then keep
increasing, yet the HLT emulation can never be executed. This results
in a huge number (above 220,000 per second) of halt exits for an idle
Windows XP guest that uses the RTC as its clocksource (it keeps the PIT
enabled after the BIOS did, with just the pin masked), and the CPU
utilization of the QEmu process is about 100%.

The following patch uses another way to fix the issue, though not very
formal.

From 4d08ef3173084a6f0b7b76a0727e04ff42b614ba Mon Sep 17 00:00:00 2001
From: Sheng Yang <she...@in...>
Date: Fri, 9 May 2008 15:36:27 +0800
Subject: [PATCH] KVM: Fix CPU utilize hit 100% when emulate HLT in some OS

Signed-off-by: Sheng Yang <she...@in...>
---
 arch/x86/kvm/i8254.c       |    2 ++
 arch/x86/kvm/lapic.c       |    2 ++
 include/asm-x86/kvm_host.h |    2 ++
 virt/kvm/kvm_main.c        |    2 +-
 4 files changed, 7 insertions(+), 1 deletions(-)

diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index fba0e4e..b2b9eb7 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -199,6 +199,7 @@ static int __pit_timer_fn(struct kvm_kpit_state *ps)
 	struct kvm_kpit_timer *pt = &ps->pit_timer;

 	atomic_inc(&pt->pending);
+	vcpu0->arch.timer_pending_updated = 1;
 	smp_mb__after_atomic_inc();
 	/* FIXME: handle case where the guest is in guest mode */
 	if (vcpu0 && waitqueue_active(&vcpu0->wq)) {
@@ -577,6 +578,7 @@ void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu)
 	struct kvm *kvm = vcpu->kvm;
 	struct kvm_kpit_state *ps;

+	kvm->vcpus[0]->arch.timer_pending_updated = 0;
 	if (vcpu && pit) {
 		ps = &pit->pit_state;
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 7652f88..b919f3f 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -944,6 +944,7 @@ static int __apic_timer_fn(struct kvm_lapic *apic)
 	wait_queue_head_t *q = &apic->vcpu->wq;

 	atomic_inc(&apic->timer.pending);
+	apic->vcpu->arch.timer_pending_updated = 1;
 	if (waitqueue_active(q)) {
 		apic->vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
 		wake_up_interruptible(q);
@@ -1067,6 +1068,7 @@ void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu)
 {
 	struct kvm_lapic *apic = vcpu->arch.apic;

+	vcpu->arch.timer_pending_updated = 0;
 	if (apic && apic_lvt_enabled(apic, APIC_LVTT) &&
 	    atomic_read(&apic->timer.pending) > 0) {
 		if (__inject_apic_timer_irq(apic))
diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h
index 1d8cd01..5eded7b 100644
--- a/include/asm-x86/kvm_host.h
+++ b/include/asm-x86/kvm_host.h
@@ -285,6 +285,8 @@ struct kvm_vcpu_arch {
 	struct kvm_vcpu_time_info hv_clock;
 	unsigned int time_offset;
 	struct page *time_page;
+
+	bool timer_check_pending;
 };

 struct kvm_mem_alias {
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 0846d3d..ff635e3 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -791,7 +791,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
 	 * We will block until either an interrupt or a signal wakes us up
 	 */
 	while (!kvm_cpu_has_interrupt(vcpu)
-	       && !kvm_cpu_has_pending_timer(vcpu)
+	       && !vcpu->arch.timer_pending_updated
	       && !signal_pending(current)
	       && !kvm_arch_vcpu_runnable(vcpu)) {
 		set_current_state(TASK_INTERRUPTIBLE);
-- 
1.5.5
From: Marcelo T. <mto...@re...> - 2008-05-09 15:14:57

On Fri, May 09, 2008 at 03:49:20PM +0800, Yang, Sheng wrote:
> On Sunday 13 April 2008 17:28:22 Avi Kivity wrote:
> > Marcelo Tosatti wrote:
> > > On Fri, Apr 11, 2008 at 03:12:41PM +0300, Avi Kivity wrote:
> > >> This breaks ia64 (and shouldn't s390 use this too?)
> > >>
> > >>> 	 * We will block until either an interrupt or a signal wakes us up
> > >>> 	 */
> > >>> 	while (!kvm_cpu_has_interrupt(vcpu)
> > >>> +		&& !kvm_cpu_has_pending_timer(vcpu)
> > >>
> > >> I guess the fix is to stub this out for the other archs.
> > >
> > > Agreed. How's this.
> >
> > Better :); applied.
>
> Hi, Marcelo
>
> This patch gets into trouble when the OS doesn't use the PIT/LAPIC
> timer but doesn't disable them either. The pending counters then keep
> increasing, yet the HLT emulation can never be executed. This results
> in a huge number (above 220,000 per second) of halt exits for an idle
> Windows XP guest that uses the RTC as its clocksource (it keeps the
> PIT enabled after the BIOS did, with just the pin masked), and the CPU
> utilization of the QEmu process is about 100%.
>
> The following patch uses another way to fix the issue, though not very
> formal.

Hi Sheng,

Did you have kvm.git commit 8ae6dc90ac84d9734e343210c8ec709f50cd9d89
when testing this?

I believe it should fix that issue, because "ps->inject_pending" won't
be set by kvm_pit_timer_intr_post() if the IRQ is masked. Please correct
me if I'm wrong.
From: Yang, S. <she...@in...> - 2008-05-10 02:06:12

On Friday 09 May 2008 22:53:00 Marcelo Tosatti wrote:
> On Fri, May 09, 2008 at 03:49:20PM +0800, Yang, Sheng wrote:
> > On Sunday 13 April 2008 17:28:22 Avi Kivity wrote:
> > > Marcelo Tosatti wrote:
> > > > On Fri, Apr 11, 2008 at 03:12:41PM +0300, Avi Kivity wrote:
> > > >> This breaks ia64 (and shouldn't s390 use this too?)
> > > >>
> > > >>> 	 * We will block until either an interrupt or a signal wakes us up
> > > >>> 	 */
> > > >>> 	while (!kvm_cpu_has_interrupt(vcpu)
> > > >>> +		&& !kvm_cpu_has_pending_timer(vcpu)
> > > >>
> > > >> I guess the fix is to stub this out for the other archs.
> > > >
> > > > Agreed. How's this.
> > >
> > > Better :); applied.
> >
> > Hi, Marcelo
> >
> > This patch gets into trouble when the OS doesn't use the PIT/LAPIC
> > timer but doesn't disable them either. The pending counters then keep
> > increasing, yet the HLT emulation can never be executed. This results
> > in a huge number (above 220,000 per second) of halt exits for an idle
> > Windows XP guest that uses the RTC as its clocksource (it keeps the
> > PIT enabled after the BIOS did, with just the pin masked), and the
> > CPU utilization of the QEmu process is about 100%.
> >
> > The following patch uses another way to fix the issue, though not
> > very formal.
>
> Hi Sheng,
>
> Did you have kvm.git commit 8ae6dc90ac84d9734e343210c8ec709f50cd9d89
> when testing this?
>
> I believe it should fix that issue, because "ps->inject_pending" won't
> be set by kvm_pit_timer_intr_post() if the IRQ is masked. Please
> correct me if I'm wrong.

Oh, sorry, I missed that commit. But... it only solves half of the
problem. The LAPIC suffers from it as well, and the current HLT
emulation still doesn't work... And I can't find anything like
inject_pending in the LAPIC timer.

I have to say, I think my method is more precise, direct and
efficient... It can also be extended easily if we get more clock
sources (though I don't think that will happen in the near future...).
In fact, I think keying off the pending counts is something of a
*wrong concept*... We should take care of the window, i.e. the moment
when the increment of the pending counters happened, CMIIW. And it has
nothing to do with the current counter value (yeah, I realized it after
seeing the hlt behaviour in XP, not before ;) ).

-- 
Thanks
Yang, Sheng
From: Marcelo T. <mto...@re...> - 2008-05-12 16:36:53

On Sat, May 10, 2008 at 10:12:02AM +0800, Yang, Sheng wrote:
> > Did you have kvm.git commit 8ae6dc90ac84d9734e343210c8ec709f50cd9d89
> > when testing this?
> >
> > I believe it should fix that issue, because "ps->inject_pending" won't
> > be set by kvm_pit_timer_intr_post() if the IRQ is masked. Please
> > correct me if I'm wrong.
>
> Oh, sorry, I missed that commit. But... it only solves half of the
> problem. The LAPIC suffers from it as well, and the current HLT
> emulation still doesn't work... And I can't find anything like
> inject_pending in the LAPIC timer.
>
> I have to say, I think my method is more precise, direct and
> efficient... It can also be extended easily if we get more clock
> sources (though I don't think that will happen in the near future...).
> In fact, I think keying off the pending counts is something of a
> *wrong concept*... We should take care of the window, i.e. the moment
> when the increment of the pending counters happened, CMIIW. And it has
> nothing to do with the current counter value (yeah, I realized it
> after seeing the hlt behaviour in XP, not before ;) ).

Sheng,

The problem is that you don't want to emulate hlt if you have a pending
timer _and_ the guest is accepting events. So for example if there are
two apic timers pending, you inject one of them, the guest executes
hlt, and we end up in vcpu_block().

Does this work for you?

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 7652f88..d41e34c 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -961,7 +961,7 @@ int apic_has_pending_timer(struct kvm_vcpu *vcpu)
 {
 	struct kvm_lapic *lapic = vcpu->arch.apic;

-	if (lapic)
+	if (lapic && apic_enabled(lapic) && apic_lvt_enabled(lapic, APIC_LVTT))
 		return atomic_read(&lapic->timer.pending);

 	return 0;
From: Yang, S. <she...@in...> - 2008-05-14 02:56:50

On Tuesday 13 May 2008 00:40:05 Marcelo Tosatti wrote:
> On Sat, May 10, 2008 at 10:12:02AM +0800, Yang, Sheng wrote:
> > > Did you have kvm.git commit 8ae6dc90ac84d9734e343210c8ec709f50cd9d89
> > > when testing this?
> > >
> > > I believe it should fix that issue, because "ps->inject_pending"
> > > won't be set by kvm_pit_timer_intr_post() if the IRQ is masked.
> > > Please correct me if I'm wrong.
> >
> > Oh, sorry, I missed that commit. But... it only solves half of the
> > problem. The LAPIC suffers from it as well, and the current HLT
> > emulation still doesn't work... And I can't find anything like
> > inject_pending in the LAPIC timer.
> >
> > I have to say, I think my method is more precise, direct and
> > efficient... It can also be extended easily if we get more clock
> > sources (though I don't think that will happen in the near
> > future...). In fact, I think keying off the pending counts is
> > something of a *wrong concept*... We should take care of the window,
> > i.e. the moment when the increment of the pending counters happened,
> > CMIIW. And it has nothing to do with the current counter value
> > (yeah, I realized it after seeing the hlt behaviour in XP, not
> > before ;) ).
>
> Sheng,
>
> The problem is that you don't want to emulate hlt if you have a pending
> timer _and_ the guest is accepting events. So for example if there are
> two apic timers pending, you inject one of them, the guest executes
> hlt, and we end up in vcpu_block().
>
> Does this work for you?

Yeah. I also suggest using a consistent implementation for all the
_has_pending_timer functions. (In fact, if you take the pending
counters as interrupts which have to delay their injection, the
explanation works well :) )

-- 
Thanks
Yang, Sheng

> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> index 7652f88..d41e34c 100644
> --- a/arch/x86/kvm/lapic.c
> +++ b/arch/x86/kvm/lapic.c
> @@ -961,7 +961,7 @@ int apic_has_pending_timer(struct kvm_vcpu *vcpu)
>  {
>  	struct kvm_lapic *lapic = vcpu->arch.apic;
>
> -	if (lapic)
> +	if (lapic && apic_enabled(lapic) && apic_lvt_enabled(lapic, APIC_LVTT))
>  		return atomic_read(&lapic->timer.pending);
>
>  	return 0;