From: Gerd H. <kr...@re...> - 2008-04-24 08:37:15
|
Hi folks, My first attempt to send out a patch series with git ... The patches fix the kvm paravirt clocksource code to be compatible with xen and they also factor out some code which can be shared into a separate source files used by both kvm and xen. cheers, Gerd |
From: Gerd H. <kr...@re...> - 2008-04-24 08:37:12
|
Signed-off-by: Gerd Hoffmann <kr...@re...> --- arch/x86/kvm/x86.c | 63 +++++++++++++++++++++++++++++++++++++++++++-------- 1 files changed, 53 insertions(+), 10 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0ce5563..45b71c6 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -493,7 +493,7 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) { static int version; struct kvm_wall_clock wc; - struct timespec wc_ts; + struct timespec now,sys,boot; if (!wall_clock) return; @@ -502,9 +502,16 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) kvm_write_guest(kvm, wall_clock, &version, sizeof(version)); - wc_ts = current_kernel_time(); - wc.wc_sec = wc_ts.tv_sec; - wc.wc_nsec = wc_ts.tv_nsec; +#if 0 + /* Hmm, getboottime() isn't exported to modules ... */ + getboottime(&boot); +#else + now = current_kernel_time(); + ktime_get_ts(&sys); + boot = ns_to_timespec(timespec_to_ns(&now) - timespec_to_ns(&sys)); +#endif + wc.wc_sec = boot.tv_sec; + wc.wc_nsec = boot.tv_nsec; wc.wc_version = version; kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc)); @@ -537,20 +544,58 @@ static void kvm_write_guest_time(struct kvm_vcpu *v) /* * The interface expects us to write an even number signaling that the * update is finished. Since the guest won't see the intermediate - * state, we just write "2" at the end + * state, we just increase by 2 at the end. */ - vcpu->hv_clock.version = 2; + vcpu->hv_clock.version += 2; shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0); memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock, - sizeof(vcpu->hv_clock)); + sizeof(vcpu->hv_clock)); kunmap_atomic(shared_kaddr, KM_USER0); mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT); } +static uint32_t div_frac(uint32_t dividend, uint32_t divisor) +{ + uint32_t quotient, remainder; + + __asm__ ( "divl %4" + : "=a" (quotient), "=d" (remainder) + : "0" (0), "1" (dividend), "r" (divisor) ); + return quotient; +} + +static void kvm_set_time_scale(uint32_t tsc_khz, struct kvm_vcpu_time_info *hv_clock) +{ + uint64_t nsecs = 1000000000LL; + int32_t shift = 0; + uint64_t tps64; + uint32_t tps32; + + tps64 = tsc_khz * 1000LL; + while (tps64 > nsecs*2) { + tps64 >>= 1; + shift--; + } + + tps32 = (uint32_t)tps64; + while (tps32 <= (uint32_t)nsecs) { + tps32 <<= 1; + shift++; + } + + hv_clock->tsc_shift = shift; + hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32); + +#if 0 + printk(KERN_DEBUG "%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n", + __FUNCTION__, tsc_khz, hv_clock->tsc_shift, + hv_clock->tsc_to_system_mul); +#endif +} int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) { @@ -599,9 +644,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) /* ...but clean it before doing the actual write */ vcpu->arch.time_offset = data & ~(PAGE_MASK | 1); - vcpu->arch.hv_clock.tsc_to_system_mul = - clocksource_khz2mult(tsc_khz, 22); - vcpu->arch.hv_clock.tsc_shift = 22; + kvm_set_time_scale(tsc_khz, &vcpu->arch.hv_clock); down_read(¤t->mm->mmap_sem); vcpu->arch.time_page = -- 1.5.4.1 |
From: Gerd H. <kr...@re...> - 2008-04-24 08:37:15
|
Signed-off-by: Gerd Hoffmann <kr...@re...> --- arch/x86/Kconfig | 1 + arch/x86/kernel/kvmclock.c | 66 ++++++++++--------------------------------- 2 files changed, 17 insertions(+), 50 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index fe73d38..ed1a679 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -373,6 +373,7 @@ config VMI config KVM_CLOCK bool "KVM paravirtualized clock" select PARAVIRT + select PARAVIRT_CLOCK depends on !(X86_VISWS || X86_VOYAGER) help Turning on this option will allow you to run a paravirtualized clock diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index ddee040..476b7c7 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -18,6 +18,7 @@ #include <linux/clocksource.h> #include <linux/kvm_para.h> +#include <asm/pvclock.h> #include <asm/arch_hooks.h> #include <asm/msr.h> #include <asm/apic.h> @@ -37,17 +38,9 @@ early_param("no-kvmclock", parse_no_kvmclock); /* The hypervisor will put information about time periodically here */ static DEFINE_PER_CPU_SHARED_ALIGNED(struct kvm_vcpu_time_info, hv_clock); -#define get_clock(cpu, field) per_cpu(hv_clock, cpu).field - -static inline u64 kvm_get_delta(u64 last_tsc) -{ - int cpu = smp_processor_id(); - u64 delta = native_read_tsc() - last_tsc; - return (delta * get_clock(cpu, tsc_to_system_mul)) >> KVM_SCALE; -} static struct kvm_wall_clock wall_clock; -static cycle_t kvm_clock_read(void); + /* * The wallclock is the time of day when we booted. Since then, some time may * have elapsed since the hypervisor wrote the data. So we try to account for @@ -55,35 +48,19 @@ static cycle_t kvm_clock_read(void); */ unsigned long kvm_get_wallclock(void) { - u32 wc_sec, wc_nsec; - u64 delta; + struct kvm_vcpu_time_info *vcpu_time; struct timespec ts; - int version, nsec; int low, high; low = (int)__pa(&wall_clock); high = ((u64)__pa(&wall_clock) >> 32); - - delta = kvm_clock_read(); - native_write_msr(MSR_KVM_WALL_CLOCK, low, high); - do { - version = wall_clock.wc_version; - rmb(); - wc_sec = wall_clock.wc_sec; - wc_nsec = wall_clock.wc_nsec; - rmb(); - } while ((wall_clock.wc_version != version) || (version & 1)); - - delta = kvm_clock_read() - delta; - delta += wc_nsec; - nsec = do_div(delta, NSEC_PER_SEC); - set_normalized_timespec(&ts, wc_sec + delta, nsec); - /* - * Of all mechanisms of time adjustment I've tested, this one - * was the champion! - */ - return ts.tv_sec + 1; + + vcpu_time = &get_cpu_var(hv_clock); + pvclock_read_wallclock(&wall_clock, vcpu_time, &ts); + put_cpu_var(hv_clock); + + return ts.tv_sec; } int kvm_set_wallclock(unsigned long now) @@ -91,28 +68,17 @@ int kvm_set_wallclock(unsigned long now) return 0; } -/* - * This is our read_clock function. The host puts an tsc timestamp each time - * it updates a new time. Without the tsc adjustment, we can have a situation - * in which a vcpu starts to run earlier (smaller system_time), but probes - * time later (compared to another vcpu), leading to backwards time - */ static cycle_t kvm_clock_read(void) { - u64 last_tsc, now; - int cpu; + struct kvm_vcpu_time_info *src; + cycle_t ret; - preempt_disable(); - cpu = smp_processor_id(); - - last_tsc = get_clock(cpu, tsc_timestamp); - now = get_clock(cpu, system_time); - - now += kvm_get_delta(last_tsc); - preempt_enable(); - - return now; + src = &get_cpu_var(hv_clock); + ret = pvclock_clocksource_read(src); + put_cpu_var(hv_clock); + return ret; } + static struct clocksource kvm_clock = { .name = "kvm-clock", .read = kvm_clock_read, -- 1.5.4.1 |
From: Glauber C. <gl...@gm...> - 2008-04-24 13:16:41
|
> - > - delta = kvm_clock_read(); > - > native_write_msr(MSR_KVM_WALL_CLOCK, low, high); > - do { > - version = wall_clock.wc_version; > - rmb(); > - wc_sec = wall_clock.wc_sec; > - wc_nsec = wall_clock.wc_nsec; > - rmb(); > - } while ((wall_clock.wc_version != version) || (version & 1)); > - > - delta = kvm_clock_read() - delta; > - delta += wc_nsec; > - nsec = do_div(delta, NSEC_PER_SEC); > - set_normalized_timespec(&ts, wc_sec + delta, nsec); > - /* > - * Of all mechanisms of time adjustment I've tested, this one > - * was the champion! > - */ > - return ts.tv_sec + 1; > + > + vcpu_time = &get_cpu_var(hv_clock); > + pvclock_read_wallclock(&wall_clock, vcpu_time, &ts); > + put_cpu_var(hv_clock); > + > + return ts.tv_sec; > } Here it is. Despite some needed cleanups, patches look beautiful to me. -- Glauber Costa. "Free as in Freedom" http://glommer.net "The less confident you are, the more serious you have to act." |
From: Gerd H. <kr...@re...> - 2008-04-24 08:37:15
|
Signed-off-by: Gerd Hoffmann <kr...@re...> --- arch/x86/Kconfig | 4 + arch/x86/kernel/Makefile | 1 + arch/x86/kernel/pvclock.c | 146 +++++++++++++++++++++++++++++++++++++++++++++ include/asm-x86/pvclock.h | 6 ++ 4 files changed, 157 insertions(+), 0 deletions(-) create mode 100644 arch/x86/kernel/pvclock.c create mode 100644 include/asm-x86/pvclock.h diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index a22be4a..fe73d38 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -400,6 +400,10 @@ config PARAVIRT over full virtualization. However, when run without a hypervisor the kernel is theoretically slower and slightly larger. +config PARAVIRT_CLOCK + bool + default n + endif config MEMTEST_BOOTPARAM diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index fa19c38..ab7999c 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -83,6 +83,7 @@ obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o obj-$(CONFIG_KVM_GUEST) += kvm.o obj-$(CONFIG_KVM_CLOCK) += kvmclock.o obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o +obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o ifdef CONFIG_INPUT_PCSPKR obj-y += pcspeaker.o diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c new file mode 100644 index 0000000..fecf17a --- /dev/null +++ b/arch/x86/kernel/pvclock.c @@ -0,0 +1,146 @@ +/* paravirtual clock -- common code used by kvm/xen + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +*/ + +#include <linux/kernel.h> +#include <linux/percpu.h> +#include <asm/pvclock.h> + +/* + * These are perodically updated + * xen: magic shared_info page + * kvm: gpa registered via msr + * and then copied here. + */ +struct pvclock_shadow_time { + u64 tsc_timestamp; /* TSC at last update of time vals. */ + u64 system_timestamp; /* Time, in nanosecs, since boot. */ + u32 tsc_to_nsec_mul; + int tsc_shift; + u32 version; +}; + +/* + * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction, + * yielding a 64-bit result. + */ +static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift) +{ + u64 product; +#ifdef __i386__ + u32 tmp1, tmp2; +#endif + + if (shift < 0) + delta >>= -shift; + else + delta <<= shift; + +#ifdef __i386__ + __asm__ ( + "mul %5 ; " + "mov %4,%%eax ; " + "mov %%edx,%4 ; " + "mul %5 ; " + "xor %5,%5 ; " + "add %4,%%eax ; " + "adc %5,%%edx ; " + : "=A" (product), "=r" (tmp1), "=r" (tmp2) + : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) ); +#elif __x86_64__ + __asm__ ( + "mul %%rdx ; shrd $32,%%rdx,%%rax" + : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) ); +#else +#error implement me! +#endif + + return product; +} + +static u64 pvclock_get_nsec_offset(struct pvclock_shadow_time *shadow) +{ + u64 delta = native_read_tsc() - shadow->tsc_timestamp; + return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift); +} + +/* + * Reads a consistent set of time-base values from hypervisor, + * into a shadow data area. + */ +static unsigned pvclock_get_time_values(struct pvclock_shadow_time *dst, + struct kvm_vcpu_time_info *src) +{ + do { + dst->version = src->version; + rmb(); /* fetch version before data */ + dst->tsc_timestamp = src->tsc_timestamp; + dst->system_timestamp = src->system_time; + dst->tsc_to_nsec_mul = src->tsc_to_system_mul; + dst->tsc_shift = src->tsc_shift; + rmb(); /* test version after fetching data */ + } while ((src->version & 1) | (dst->version ^ src->version)); + + return dst->version; +} + +/* + * This is our read_clock function. The host puts an tsc timestamp each time + * it updates a new time. Without the tsc adjustment, we can have a situation + * in which a vcpu starts to run earlier (smaller system_time), but probes + * time later (compared to another vcpu), leading to backwards time + */ +cycle_t pvclock_clocksource_read(struct kvm_vcpu_time_info *src) +{ + struct pvclock_shadow_time shadow; + unsigned version; + cycle_t ret; + + do { + version = pvclock_get_time_values(&shadow, src); + barrier(); + ret = shadow.system_timestamp + pvclock_get_nsec_offset(&shadow); + barrier(); + } while (version != src->version); + + return ret; +} + +void pvclock_read_wallclock(struct kvm_wall_clock *wall_clock, + struct kvm_vcpu_time_info *vcpu_time, + struct timespec *ts) +{ + u32 version; + u64 delta; + struct timespec now; + + /* get wallclock at system boot */ + do { + version = wall_clock->wc_version; + rmb(); /* fetch version before time */ + now.tv_sec = wall_clock->wc_sec; + now.tv_nsec = wall_clock->wc_nsec; + rmb(); /* fetch time before checking version */ + } while ((wall_clock->wc_version & 1) || (version != wall_clock->wc_version)); + + delta = pvclock_clocksource_read(vcpu_time); /* time since system boot */ + delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec; + + now.tv_nsec = do_div(delta, NSEC_PER_SEC); + now.tv_sec = delta; + + set_normalized_timespec(ts, now.tv_sec, now.tv_nsec); +} diff --git a/include/asm-x86/pvclock.h b/include/asm-x86/pvclock.h new file mode 100644 index 0000000..2b9812f --- /dev/null +++ b/include/asm-x86/pvclock.h @@ -0,0 +1,6 @@ +#include <linux/clocksource.h> +#include <asm/kvm_para.h> +cycle_t pvclock_clocksource_read(struct kvm_vcpu_time_info *src); +void pvclock_read_wallclock(struct kvm_wall_clock *wall, + struct kvm_vcpu_time_info *vcpu, + struct timespec *ts); -- 1.5.4.1 |
From: Gerd H. <kr...@re...> - 2008-04-24 08:37:12
|
Signed-off-by: Gerd Hoffmann <kr...@re...> --- arch/x86/xen/Kconfig | 1 + arch/x86/xen/time.c | 110 +++++--------------------------------------------- 2 files changed, 12 insertions(+), 99 deletions(-) diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index 4d5f264..47f0cdc 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -5,6 +5,7 @@ config XEN bool "Xen guest support" select PARAVIRT + select PARAVIRT_CLOCK depends on X86_32 depends on X86_CMPXCHG && X86_TSC && !NEED_MULTIPLE_NODES && !(X86_VISWS || X86_VOYAGER) help diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index c39e1a5..3d5f945 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -13,6 +13,7 @@ #include <linux/clockchips.h> #include <linux/kernel_stat.h> +#include <asm/pvclock.h> #include <asm/xen/hypervisor.h> #include <asm/xen/hypercall.h> @@ -30,17 +31,6 @@ static cycle_t xen_clocksource_read(void); -/* These are perodically updated in shared_info, and then copied here. */ -struct shadow_time_info { - u64 tsc_timestamp; /* TSC at last update of time vals. */ - u64 system_timestamp; /* Time, in nanosecs, since boot. */ - u32 tsc_to_nsec_mul; - int tsc_shift; - u32 version; -}; - -static DEFINE_PER_CPU(struct shadow_time_info, shadow_time); - /* runstate info updated by Xen */ static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate); @@ -230,95 +220,14 @@ unsigned long xen_cpu_khz(void) return xen_khz; } -/* - * Reads a consistent set of time-base values from Xen, into a shadow data - * area. - */ -static unsigned get_time_values_from_xen(void) -{ - struct vcpu_time_info *src; - struct shadow_time_info *dst; - - /* src is shared memory with the hypervisor, so we need to - make sure we get a consistent snapshot, even in the face of - being preempted. */ - src = &__get_cpu_var(xen_vcpu)->time; - dst = &__get_cpu_var(shadow_time); - - do { - dst->version = src->version; - rmb(); /* fetch version before data */ - dst->tsc_timestamp = src->tsc_timestamp; - dst->system_timestamp = src->system_time; - dst->tsc_to_nsec_mul = src->tsc_to_system_mul; - dst->tsc_shift = src->tsc_shift; - rmb(); /* test version after fetching data */ - } while ((src->version & 1) | (dst->version ^ src->version)); - - return dst->version; -} - -/* - * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction, - * yielding a 64-bit result. - */ -static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift) -{ - u64 product; -#ifdef __i386__ - u32 tmp1, tmp2; -#endif - - if (shift < 0) - delta >>= -shift; - else - delta <<= shift; - -#ifdef __i386__ - __asm__ ( - "mul %5 ; " - "mov %4,%%eax ; " - "mov %%edx,%4 ; " - "mul %5 ; " - "xor %5,%5 ; " - "add %4,%%eax ; " - "adc %5,%%edx ; " - : "=A" (product), "=r" (tmp1), "=r" (tmp2) - : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) ); -#elif __x86_64__ - __asm__ ( - "mul %%rdx ; shrd $32,%%rdx,%%rax" - : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) ); -#else -#error implement me! -#endif - - return product; -} - -static u64 get_nsec_offset(struct shadow_time_info *shadow) -{ - u64 now, delta; - now = native_read_tsc(); - delta = now - shadow->tsc_timestamp; - return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift); -} - static cycle_t xen_clocksource_read(void) { - struct shadow_time_info *shadow = &get_cpu_var(shadow_time); + struct vcpu_time_info *src; cycle_t ret; - unsigned version; - - do { - version = get_time_values_from_xen(); - barrier(); - ret = shadow->system_timestamp + get_nsec_offset(shadow); - barrier(); - } while (version != __get_cpu_var(xen_vcpu)->time.version); - - put_cpu_var(shadow_time); + src = &get_cpu_var(xen_vcpu)->time; + ret = pvclock_clocksource_read((void*)src); + put_cpu_var(xen_vcpu); return ret; } @@ -349,9 +258,14 @@ static void xen_read_wallclock(struct timespec *ts) unsigned long xen_get_wallclock(void) { + const struct shared_info *s = HYPERVISOR_shared_info; + struct kvm_wall_clock *wall_clock = (void*)&(s->wc_version); + struct vcpu_time_info *vcpu_time; struct timespec ts; - xen_read_wallclock(&ts); + vcpu_time = &get_cpu_var(xen_vcpu)->time; + pvclock_read_wallclock(wall_clock, (void*)vcpu_time, &ts); + put_cpu_var(xen_vcpu); return ts.tv_sec; } @@ -576,8 +490,6 @@ __init void xen_time_init(void) { int cpu = smp_processor_id(); - get_time_values_from_xen(); - clocksource_register(&xen_clocksource); if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) { -- 1.5.4.1 |
From: Glauber C. <gl...@gm...> - 2008-04-24 13:11:00
|
> +void pvclock_read_wallclock(struct kvm_wall_clock *wall_clock, > + struct kvm_vcpu_time_info *vcpu_time, > + struct timespec *ts) > +{ > + u32 version; > + u64 delta; > + struct timespec now; > + > + /* get wallclock at system boot */ > + do { > + version = wall_clock->wc_version; > + rmb(); /* fetch version before time */ > + now.tv_sec = wall_clock->wc_sec; > + now.tv_nsec = wall_clock->wc_nsec; > + rmb(); /* fetch time before checking version */ > + } while ((wall_clock->wc_version & 1) || (version != wall_clock->wc_version)); > + > + delta = pvclock_clocksource_read(vcpu_time); /* time since system boot */ > + delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec; > + > + now.tv_nsec = do_div(delta, NSEC_PER_SEC); > + now.tv_sec = delta; > + > + set_normalized_timespec(ts, now.tv_sec, now.tv_nsec); > +} This is not exactly what kvm does. For us, wallclock read and system time reads are decoupled operations, controlled by different msrs. This function might exist, but in this case, it have to be wrapped around a kvm_read_wallclock(), that does the msr read. (I'm not sure whether or not you do it in your later patches, doing sequential reads :-) ) > ------------------------------------------------------------------------- > This SF.net email is sponsored by the 2008 JavaOne(SM) Conference > Don't miss this year's exciting event. There's still time to save $100. > Use priority code J8TL2D2. > http://ad.doubleclick.net/clk;198757673;13503038;p?http://java.sun.com/javaone ads! ads! -- Glauber Costa. "Free as in Freedom" http://glommer.net "The less confident you are, the more serious you have to act." |
From: Gerd H. <kr...@re...> - 2008-04-28 08:54:52
|
Glauber Costa wrote: > This is not exactly what kvm does. For us, wallclock read and system > time reads are decoupled operations, controlled by different msrs. Same for xen. Although both live in the shared_info page they are updated independently (and the wall clock is updated much less frequently). > This function might exist, but in this case, it have to be wrapped > around a kvm_read_wallclock(), that does the msr read. (I'm not sure > whether or not you do it in your later patches, doing sequential reads > :-) ) It is, as you have seen in the kvm patch ;) What is the reason to handle the two clock msrs in different ways btw? I think it would be better to have both msrs work the same way though, i.e. the wallclock msr should have a enable bit and should auto-update too. cheers, Gerd -- http://kraxel.fedorapeople.org/xenner/ |
From: Avi K. <av...@qu...> - 2008-04-27 12:58:47
|
Gerd Hoffmann wrote: > Hi folks, > > My first attempt to send out a patch series with git ... > > The patches fix the kvm paravirt clocksource code to be compatible with > xen and they also factor out some code which can be shared into a > separate source files used by both kvm and xen. > > The patches look good, but pleasy copy Jeremy and virtualization@ for patches which touch things outside kvm. It's perhaps better to reverse the order: first fix kvm to be compatible, then merge the Xen and kvm implementations into a single one. -- error compiling committee.c: too many arguments to function |
From: Gerd H. <kr...@re...> - 2008-04-28 12:09:35
|
Avi Kivity wrote: > The patches look good, but pleasy copy Jeremy and virtualization@ for > patches which touch things outside kvm. Will do for the next round. > It's perhaps better to reverse the order: first fix kvm to be > compatible, then merge the Xen and kvm implementations into a single one. Fixing kvm (guest side) would be copying parts of the xen guest code, so that doesn't make much sense to me ... cheers, Gerd -- http://kraxel.fedorapeople.org/xenner/ |
From: Marcelo T. <mto...@re...> - 2008-04-28 19:25:54
|
On Thu, Apr 24, 2008 at 10:37:04AM +0200, Gerd Hoffmann wrote: > Hi folks, > > My first attempt to send out a patch series with git ... > > The patches fix the kvm paravirt clocksource code to be compatible with > xen and they also factor out some code which can be shared into a > separate source files used by both kvm and xen. The issue with SMP guests is still present. Booting with "nohz=off" resolves it. Same symptoms as before, apic_timer_fn for one of the vcpu's is ticking way slower than the remaining ones: [root@localhost ~]# cat /proc/timer_stats | grep apic 391, 4125 qemu-system-x86 apic_mmio_write (apic_timer_fn) 2103, 4126 qemu-system-x86 apic_mmio_write (apic_timer_fn) 1896, 4127 qemu-system-x86 apic_mmio_write (apic_timer_fn) 1857, 4128 qemu-system-x86 apic_mmio_write (apic_timer_fn) Let me know what else is needed, or any patches to try. |
From: Gerd H. <kr...@re...> - 2008-05-05 07:48:05
|
Marcelo Tosatti wrote: > On Thu, Apr 24, 2008 at 10:37:04AM +0200, Gerd Hoffmann wrote: >> Hi folks, >> >> My first attempt to send out a patch series with git ... >> >> The patches fix the kvm paravirt clocksource code to be compatible with >> xen and they also factor out some code which can be shared into a >> separate source files used by both kvm and xen. > > The issue with SMP guests is still present. Booting with "nohz=off" resolves it. > > Same symptoms as before, apic_timer_fn for one of the vcpu's is ticking way slower > than the remaining ones: > > [root@localhost ~]# cat /proc/timer_stats | grep apic > 391, 4125 qemu-system-x86 apic_mmio_write (apic_timer_fn) > 2103, 4126 qemu-system-x86 apic_mmio_write (apic_timer_fn) > 1896, 4127 qemu-system-x86 apic_mmio_write (apic_timer_fn) > 1857, 4128 qemu-system-x86 apic_mmio_write (apic_timer_fn) What userspace version is this? With iothread support? Or older one where the vcpu0 thread also handles all the I/O? Is 4x neeed to reproduce or do you see it with 2x too? What host? A quick test with xenner (which has a separate I/O thread) didn't show anything unusual. Going investigate ... cheers, Gerd -- http://kraxel.fedorapeople.org/xenner/ |
From: Marcelo T. <mto...@re...> - 2008-05-05 15:29:55
|
On Mon, May 05, 2008 at 09:47:59AM +0200, Gerd Hoffmann wrote: > Marcelo Tosatti wrote: > > On Thu, Apr 24, 2008 at 10:37:04AM +0200, Gerd Hoffmann wrote: > >> Hi folks, > >> > >> My first attempt to send out a patch series with git ... > >> > >> The patches fix the kvm paravirt clocksource code to be compatible with > >> xen and they also factor out some code which can be shared into a > >> separate source files used by both kvm and xen. > > > > The issue with SMP guests is still present. Booting with "nohz=off" resolves it. > > > > Same symptoms as before, apic_timer_fn for one of the vcpu's is ticking way slower > > than the remaining ones: > > > > [root@localhost ~]# cat /proc/timer_stats | grep apic > > 391, 4125 qemu-system-x86 apic_mmio_write (apic_timer_fn) > > 2103, 4126 qemu-system-x86 apic_mmio_write (apic_timer_fn) > > 1896, 4127 qemu-system-x86 apic_mmio_write (apic_timer_fn) > > 1857, 4128 qemu-system-x86 apic_mmio_write (apic_timer_fn) > > What userspace version is this? With iothread support? Or older one > where the vcpu0 thread also handles all the I/O? Is 4x neeed to > reproduce or do you see it with 2x too? What host? F8 host, recent kvm-userspace.git (so with IO thread), recent kvm.git (plus your patches), haven't tried 2x but I think 4x is not necessary to reproduce the problem. > A quick test with xenner (which has a separate I/O thread) didn't show > anything unusual. Going investigate ... Give a pure kvm guest a try, its pretty easy to reproduce. |
From: Gerd H. <kr...@re...> - 2008-05-06 16:00:05
|
Marcelo Tosatti wrote: > F8 host, recent kvm-userspace.git (so with IO thread), recent kvm.git > (plus your patches), haven't tried 2x but I think 4x is not necessary to > reproduce the problem. Ok, see it too. Seem to be actually two (maybe related) problems. First the guest hangs hard after a while, burning 100% CPU time (deadlocked I guess), doesn't respond to sysrq any more. Is there some easy way to get the guest vcpu state then? EIP for starters, preferably with stack trace? The other one is that one ticks slower than the other. I don't see it from start, but after a while it starts happening (unless the guest deadlocks before ...). cheers, Gerd -- http://kraxel.fedorapeople.org/xenner/ |
From: Gerd H. <kr...@re...> - 2008-05-06 16:44:56
|
Gerd Hoffmann wrote: > Marcelo Tosatti wrote: >> F8 host, recent kvm-userspace.git (so with IO thread), recent kvm.git >> (plus your patches), haven't tried 2x but I think 4x is not necessary to >> reproduce the problem. > > Ok, see it too. Seem to be actually two (maybe related) problems. > > First the guest hangs hard after a while, burning 100% CPU time > (deadlocked I guess), doesn't respond to sysrq any more. Is there some > easy way to get the guest vcpu state then? Hmm, "info registers" in qemu monitor hangs ... cheers, Gerd -- http://kraxel.fedorapeople.org/xenner/ |
From: Gerd H. <kr...@re...> - 2008-05-07 18:45:24
Attachments:
fix
|
Marcelo Tosatti wrote: > On Thu, Apr 24, 2008 at 10:37:04AM +0200, Gerd Hoffmann wrote: >> Hi folks, >> >> My first attempt to send out a patch series with git ... >> >> The patches fix the kvm paravirt clocksource code to be compatible with >> xen and they also factor out some code which can be shared into a >> separate source files used by both kvm and xen. > > The issue with SMP guests is still present. Booting with "nohz=off" resolves it. > > Same symptoms as before, apic_timer_fn for one of the vcpu's is ticking way slower > than the remaining ones: > > [root@localhost ~]# cat /proc/timer_stats | grep apic > 391, 4125 qemu-system-x86 apic_mmio_write (apic_timer_fn) > 2103, 4126 qemu-system-x86 apic_mmio_write (apic_timer_fn) > 1896, 4127 qemu-system-x86 apic_mmio_write (apic_timer_fn) > 1857, 4128 qemu-system-x86 apic_mmio_write (apic_timer_fn) > > Let me know what else is needed, or any patches to try. Ok folks, here is the band aid fix for testing from the odd bugs department. Goes on top of the four patches of this series. A real, clean solution is TBD. Tomorrow I hope (some urgent private problems are in the queue too ...). Problem is the per-cpu area for cpu 0 has two locations in memory, one before and one after pda initialization. kvmclock registers the first due to being initialized quite early, and the paravirt clock for cpu 0 stops seeing updates once the pda setup is done. Which makes the TSC effectively the base for timekeeping (instead of using the TSC for millisecond delta adjustments only). Secondary CPUs work as intended. This obviously screws up timekeeping on SMP guests, especially on hosts with unstable TSC. happy testing, Gerd -- [root@localhost ~]# dmesg | grep _clock kvm_register_clock: cpu 0 at 0:798601 (boot) kvm_clock_read: cpu 0 at 0:140b601 (pda) kvm_register_clock: cpu 1 at 0:1415601 |
From: Marcelo T. <mto...@re...> - 2008-05-08 23:07:30
|
On Wed, May 07, 2008 at 08:45:12PM +0200, Gerd Hoffmann wrote: > Ok folks, here is the band aid fix for testing from the odd bugs > department. Goes on top of the four patches of this series. A real, > clean solution is TBD. Tomorrow I hope (some urgent private problems > are in the queue too ...). > > Problem is the per-cpu area for cpu 0 has two locations in memory, one > before and one after pda initialization. kvmclock registers the first > due to being initialized quite early, and the paravirt clock for cpu 0 > stops seeing updates once the pda setup is done. Which makes the TSC > effectively the base for timekeeping (instead of using the TSC for > millisecond delta adjustments only). Secondary CPUs work as intended. > > This obviously screws up timekeeping on SMP guests, especially on hosts > with unstable TSC. > > happy testing, Gerd, SMP guests can boot and seem stable. Thanks! |
From: Glauber C. <gc...@re...> - 2008-05-09 13:36:45
|
Marcelo Tosatti wrote: > On Wed, May 07, 2008 at 08:45:12PM +0200, Gerd Hoffmann wrote: >> Ok folks, here is the band aid fix for testing from the odd bugs >> department. Goes on top of the four patches of this series. A real, >> clean solution is TBD. Tomorrow I hope (some urgent private problems >> are in the queue too ...). >> >> Problem is the per-cpu area for cpu 0 has two locations in memory, one >> before and one after pda initialization. kvmclock registers the first >> due to being initialized quite early, and the paravirt clock for cpu 0 >> stops seeing updates once the pda setup is done. Which makes the TSC >> effectively the base for timekeeping (instead of using the TSC for >> millisecond delta adjustments only). Secondary CPUs work as intended. >> >> This obviously screws up timekeeping on SMP guests, especially on hosts >> with unstable TSC. >> >> happy testing, > > Gerd, > > SMP guests can boot and seem stable. Thanks! > There are also other cases that suffer from the same problem: living in a per-cpu area before setup_per_cpu_area is done, and thus, having to update the addresses later on. arch/x86/kernel/setup.c has an example of such code, the apicid mappings. With one more user of it arriving, as a result of something that turned out to be a hard to find issue, I'm starting to get the feeling that we can do better than this, on the overall. Looking quickly at the code, it seems to me that moving the per-cpu initialization earlier is not possible, or at least, not trivially possible. So maybe declare the per-cpu areas in a special section, then in setup_per_cpu_areas, copy them into the definitive per-cpu section and update the callers? What do you think? |
From: Gerd H. <kr...@re...> - 2008-05-13 06:54:05
|
Glauber Costa wrote: > So maybe declare the per-cpu areas in a special section, then in > setup_per_cpu_areas, copy them into the definitive per-cpu section and > update the callers? The special section and the copy is implemented already. That doesn't cut it for the kvmclock case though. We registered the physical address via msr write in the host, and *that* needs an update too. Otherwise the host continues to update the pre-setup location, and the guest sees the (stale) values the kvm clock had at per-cpu-area-setup time (when the copy took place). cheers, Gerd -- http://kraxel.fedorapeople.org/xenner/ |