Gerd Hoffmann wrote: > Signed-off-by: Gerd Hoffmann <kraxel@xxxxxxxxxx> > Looks basically OK, but some comments. > --- > arch/x86/Kconfig | 4 + > arch/x86/kernel/Makefile | 1 + > arch/x86/kernel/pvclock.c | 148 +++++++++++++++++++++++++++++++++++++++++++++ > include/asm-x86/pvclock.h | 6 ++ > 4 files changed, 159 insertions(+), 0 deletions(-) > create mode 100644 arch/x86/kernel/pvclock.c > create mode 100644 include/asm-x86/pvclock.h > > diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig > index fe361ae..deb3049 100644 > --- a/arch/x86/Kconfig > +++ b/arch/x86/Kconfig > @@ -417,6 +417,10 @@ config PARAVIRT > over full virtualization. However, when run without a hypervisor > the kernel is theoretically slower and slightly larger. > > +config PARAVIRT_CLOCK > + bool > + default n > + > endif > > config MEMTEST_BOOTPARAM > diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile > index 5e618c3..77807d4 100644 > --- a/arch/x86/kernel/Makefile > +++ b/arch/x86/kernel/Makefile > @@ -82,6 +82,7 @@ obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o > obj-$(CONFIG_KVM_GUEST) += kvm.o > obj-$(CONFIG_KVM_CLOCK) += kvmclock.o > obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o > +obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o > > obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o > > diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c > new file mode 100644 > index 0000000..33e526f > --- /dev/null > +++ b/arch/x86/kernel/pvclock.c > @@ -0,0 +1,148 @@ > +/* paravirtual clock -- common code used by kvm/xen > + > + This program is free software; you can redistribute it and/or modify > + it under the terms of the GNU General Public License as published by > + the Free Software Foundation; either version 2 of the License, or > + (at your option) any later version. > + > + This program is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the > + GNU General Public License for more details. > + > + You should have received a copy of the GNU General Public License > + along with this program; if not, write to the Free Software > + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA > +*/ > + > +#include <linux/kernel.h> > +#include <linux/percpu.h> > +#include <asm/pvclock.h> > + > +/* > + * These are perodically updated > + * xen: magic shared_info page > + * kvm: gpa registered via msr > + * and then copied here. > + */ > +struct pvclock_shadow_time { > + u64 tsc_timestamp; /* TSC at last update of time vals. */ > + u64 system_timestamp; /* Time, in nanosecs, since boot. */ > + u32 tsc_to_nsec_mul; > + int tsc_shift; > + u32 version; > +}; > + > +/* > + * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction, > + * yielding a 64-bit result. > + */ > +static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift) > +{ > + u64 product; > +#ifdef __i386__ > + u32 tmp1, tmp2; > +#endif > + > + if (shift < 0) > + delta >>= -shift; > + else > + delta <<= shift; > + > +#ifdef __i386__ > + __asm__ ( > + "mul %5 ; " > + "mov %4,%%eax ; " > + "mov %%edx,%4 ; " > + "mul %5 ; " > + "xor %5,%5 ; " > + "add %4,%%eax ; " > + "adc %5,%%edx ; " > + : "=A" (product), "=r" (tmp1), "=r" (tmp2) > + : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) ); > +#elif __x86_64__ > + __asm__ ( > + "mul %%rdx ; shrd $32,%%rdx,%%rax" > + : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) ); > +#else > +#error implement me! > +#endif > + > + return product; > +} > + > +static u64 pvclock_get_nsec_offset(struct pvclock_shadow_time *shadow) > +{ > + u64 delta = native_read_tsc() - shadow->tsc_timestamp; > + return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift); > +} > + > +/* > + * Reads a consistent set of time-base values from hypervisor, > + * into a shadow data area. > + */ > +static unsigned pvclock_get_time_values(struct pvclock_shadow_time *dst, > + struct kvm_vcpu_time_info *src) > I think the kvm_* types should be renamed. xen_* would make some sense since the ABI originated with Xen, but something generic would be OK too. The important thing to get across is that the structure defines a guest<->host ABI which happens to be shared by Xen and KVM, and it isn't an in-kernel API like the rest of paravirt_ops. And having defined a common structure, we may as well use it in the hypervisor-specific code/headers so there's no strange casting going on. > +{ > + do { > + dst->version = src->version; > + rmb(); /* fetch version before data */ > + dst->tsc_timestamp = src->tsc_timestamp; > + dst->system_timestamp = src->system_time; > + dst->tsc_to_nsec_mul = src->tsc_to_system_mul; > + dst->tsc_shift = src->tsc_shift; > + rmb(); /* test version after fetching data */ > + } while ((src->version & 1) || (dst->version != src->version)); > + > + return dst->version; > +} > + > +/* > + * This is our read_clock function. The host puts an tsc timestamp each time > + * it updates a new time. Without the tsc adjustment, we can have a situation > + * in which a vcpu starts to run earlier (smaller system_time), but probes > + * time later (compared to another vcpu), leading to backwards time > + */ > This comment is confusing. Are you explaining why the tsc is read within the loop? I think it can be clarified. > + > +cycle_t pvclock_clocksource_read(struct kvm_vcpu_time_info *src) > +{ > + struct pvclock_shadow_time shadow; > + unsigned version; > + cycle_t ret, offset; > + > + do { > + version = pvclock_get_time_values(&shadow, src); > + barrier(); > + offset = pvclock_get_nsec_offset(&shadow); > + ret = shadow.system_timestamp + offset; > + barrier(); > + } while (version != src->version); > + > + return ret; > +} > + > +void pvclock_read_wallclock(struct kvm_wall_clock *wall_clock, > + struct kvm_vcpu_time_info *vcpu_time, > Ditto type names. > + struct timespec *ts) > +{ > + u32 version; > + u64 delta; > + struct timespec now; > + > + /* get wallclock at system boot */ > + do { > + version = wall_clock->wc_version; > + rmb(); /* fetch version before time */ > + now.tv_sec = wall_clock->wc_sec; > + now.tv_nsec = wall_clock->wc_nsec; > + rmb(); /* fetch time before checking version */ > + } while ((wall_clock->wc_version & 1) || (version != wall_clock->wc_version)); > + > + delta = pvclock_clocksource_read(vcpu_time); /* time since system boot */ > + delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec; > + > + now.tv_nsec = do_div(delta, NSEC_PER_SEC); > + now.tv_sec = delta; > + > + set_normalized_timespec(ts, now.tv_sec, now.tv_nsec); > +} > diff --git a/include/asm-x86/pvclock.h b/include/asm-x86/pvclock.h > new file mode 100644 > index 0000000..2b9812f > --- /dev/null > +++ b/include/asm-x86/pvclock.h > @@ -0,0 +1,6 @@ > +#include <linux/clocksource.h> > +#include <asm/kvm_para.h> > +cycle_t pvclock_clocksource_read(struct kvm_vcpu_time_info *src); > +void pvclock_read_wallclock(struct kvm_wall_clock *wall, > + struct kvm_vcpu_time_info *vcpu, > + struct timespec *ts); > No #ifdef guards? J _______________________________________________ Virtualization mailing list Virtualization@xxxxxxxxxxxxxxxxxxxxxxxxxx https://lists.linux-foundation.org/mailman/listinfo/virtualization