On 10/23/2011 12:07 PM, Raghavendra K T wrote: > This patch extends Linux guests running on KVM hypervisor to support > pv-ticketlocks. Very early during bootup, paravirtualized KVM guest detects if > the hypervisor has required feature (KVM_FEATURE_WAIT_FOR_KICK) to support > pv-ticketlocks. If so, support for pv-ticketlocks is registered via pv_lock_ops. > > Signed-off-by: Srivatsa Vaddagiri <vatsa@xxxxxxxxxxxxxxxxxx> > Signed-off-by: Suzuki Poulose <suzuki@xxxxxxxxxx> > Signed-off-by: Raghavendra K T <raghavendra.kt@xxxxxxxxxxxxxxxxxx> > --- > diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h > index 2874c19..c7f34b7 100644 > --- a/arch/x86/include/asm/kvm_para.h > +++ b/arch/x86/include/asm/kvm_para.h > @@ -195,10 +195,18 @@ void kvm_async_pf_task_wait(u32 token); > void kvm_async_pf_task_wake(u32 token); > u32 kvm_read_and_reset_pf_reason(void); > extern void kvm_disable_steal_time(void); > -#else > + > +#ifdef CONFIG_PARAVIRT_SPINLOCKS > +void __init kvm_guest_early_init(void); > +#else /* CONFIG_PARAVIRT_SPINLOCKS */ > +#define kvm_guest_early_init() do { } while (0) > +#endif /* CONFIG_PARAVIRT_SPINLOCKS */ > + > +#else /* CONFIG_KVM_GUEST */ > #define kvm_guest_init() do { } while (0) > #define kvm_async_pf_task_wait(T) do {} while(0) > #define kvm_async_pf_task_wake(T) do {} while(0) > +#define kvm_guest_early_init() do { } while (0) > static inline u32 kvm_read_and_reset_pf_reason(void) > { > return 0; > diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c > index 3bb0850..fb25bca 100644 > --- a/arch/x86/kernel/head32.c > +++ b/arch/x86/kernel/head32.c > @@ -9,6 +9,7 @@ > #include <linux/start_kernel.h> > #include <linux/mm.h> > #include <linux/memblock.h> > +#include <linux/kvm_para.h> > > #include <asm/setup.h> > #include <asm/sections.h> > @@ -59,6 +60,8 @@ void __init i386_start_kernel(void) > break; > } > > + kvm_guest_early_init(); > + > /* > * At this point everything still needed from the boot loader > * or 
BIOS or kernel text should be early reserved or marked not > diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c > index 5655c22..cabf8ec 100644 > --- a/arch/x86/kernel/head64.c > +++ b/arch/x86/kernel/head64.c > @@ -13,6 +13,7 @@ > #include <linux/start_kernel.h> > #include <linux/io.h> > #include <linux/memblock.h> > +#include <linux/kvm_para.h> > > #include <asm/processor.h> > #include <asm/proto.h> > @@ -115,6 +116,8 @@ void __init x86_64_start_reservations(char *real_mode_data) > > reserve_ebda_region(); > > + kvm_guest_early_init(); > + > /* > * At this point everything still needed from the boot loader > * or BIOS or kernel text should be early reserved or marked not > diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c > index a9c2116..f4f341f 100644 > --- a/arch/x86/kernel/kvm.c > +++ b/arch/x86/kernel/kvm.c > @@ -39,6 +39,16 @@ > #include <asm/desc.h> > #include <asm/tlbflush.h> > > +#ifdef CONFIG_PARAVIRT_SPINLOCKS > + > +#ifdef CONFIG_KVM_DEBUG_FS > + > +#include <linux/debugfs.h> > + > +#endif /* CONFIG_KVM_DEBUG_FS */ > + > +#endif /* CONFIG_PARAVIRT_SPINLOCKS */ > + > #define MMU_QUEUE_SIZE 1024 > > static int kvmapf = 1; > @@ -627,3 +637,240 @@ static __init int activate_jump_labels(void) > return 0; > } > arch_initcall(activate_jump_labels); > + > +#ifdef CONFIG_PARAVIRT_SPINLOCKS > + > +#ifdef CONFIG_KVM_DEBUG_FS > + > +static struct kvm_spinlock_stats > +{ > + u32 taken_slow; > + u32 taken_slow_pickup; > + > + u32 released_slow; > + u32 released_slow_kicked; > + > +#define HISTO_BUCKETS 30 > + u32 histo_spin_blocked[HISTO_BUCKETS+1]; > + > + u64 time_blocked; > +} spinlock_stats; > + > +static u8 zero_stats; > + > +static inline void check_zero(void) > +{ > + if (unlikely(zero_stats)) { > + memset(&spinlock_stats, 0, sizeof(spinlock_stats)); > + zero_stats = 0; > + } > +} > + > +#define ADD_STATS(elem, val) \ > + do { check_zero(); spinlock_stats.elem += (val); } while (0) > + > +static inline u64 spin_time_start(void) > +{ > 
+ return sched_clock(); > +} > + > +static void __spin_time_accum(u64 delta, u32 *array) > +{ > + unsigned index = ilog2(delta); > + > + check_zero(); > + > + if (index < HISTO_BUCKETS) > + array[index]++; > + else > + array[HISTO_BUCKETS]++; > +} > + > +static inline void spin_time_accum_blocked(u64 start) > +{ > + u32 delta = sched_clock() - start; > + > + __spin_time_accum(delta, spinlock_stats.histo_spin_blocked); > + spinlock_stats.time_blocked += delta; > +} > + > +static struct dentry *d_spin_debug; > +static struct dentry *d_kvm_debug; > + > +struct dentry *kvm_init_debugfs(void) > +{ > + d_kvm_debug = debugfs_create_dir("kvm", NULL); > + if (!d_kvm_debug) > + printk(KERN_WARNING "Could not create 'kvm' debugfs directory\n"); > + > + return d_kvm_debug; > +} > + > +static int __init kvm_spinlock_debugfs(void) > +{ > + struct dentry *d_kvm = kvm_init_debugfs(); > + > + if (d_kvm == NULL) > + return -ENOMEM; > + > + d_spin_debug = debugfs_create_dir("spinlocks", d_kvm); > + > + debugfs_create_u8("zero_stats", 0644, d_spin_debug, &zero_stats); > + > + debugfs_create_u32("taken_slow", 0444, d_spin_debug, > + &spinlock_stats.taken_slow); > + debugfs_create_u32("taken_slow_pickup", 0444, d_spin_debug, > + &spinlock_stats.taken_slow_pickup); > + > + debugfs_create_u32("released_slow", 0444, d_spin_debug, > + &spinlock_stats.released_slow); > + debugfs_create_u32("released_slow_kicked", 0444, d_spin_debug, > + &spinlock_stats.released_slow_kicked); > + > + debugfs_create_u64("time_blocked", 0444, d_spin_debug, > + &spinlock_stats.time_blocked); > + > + debugfs_create_u32_array("histo_blocked", 0444, d_spin_debug, > + spinlock_stats.histo_spin_blocked, HISTO_BUCKETS + 1); > + > + return 0; > +} > +fs_initcall(kvm_spinlock_debugfs); > +#else /* !CONFIG_KVM_DEBUG_FS */ > +#define TIMEOUT (1 << 10) > +#define ADD_STATS(elem, val) do { (void)(val); } while (0) > + > +static inline u64 spin_time_start(void) > +{ > + return 0; > +} > + > +static inline void 
spin_time_accum_blocked(u64 start) > +{ > +} > +#endif /* CONFIG_KVM_DEBUG_FS */ > + > +struct kvm_lock_waiting { > + struct arch_spinlock *lock; > + __ticket_t want; > +}; > + > +/* cpus 'waiting' on a spinlock to become available */ > +static cpumask_t waiting_cpus; > + > +/* Track spinlock on which a cpu is waiting */ > +static DEFINE_PER_CPU(struct kvm_lock_waiting, lock_waiting); > + > +static inline void kvm_wait_for_kick(void) > +{ > + kvm_hypercall0(KVM_HC_WAIT_FOR_KICK); > +} > + > +static void kvm_lock_spinning(struct arch_spinlock *lock, __ticket_t want) > +{ > + struct kvm_lock_waiting *w = &__get_cpu_var(lock_waiting); > + int cpu = smp_processor_id(); > + u64 start; > + unsigned long flags; > + > + start = spin_time_start(); > + > + /* > + * Make sure an interrupt handler can't upset things in a > + * partially setup state. > + */ > + local_irq_save(flags); > + > + /* > + * The ordering protocol on this is that the "lock" pointer > + * may only be set non-NULL if the "want" ticket is correct. > + * If we're updating "want", we must first clear "lock". > + */ > + w->lock = NULL; > + smp_wmb(); > + w->want = want; > + smp_wmb(); > + w->lock = lock; > + > + ADD_STATS(taken_slow, 1); > + > + /* > + * This uses set_bit, which is atomic but we should not rely on its > + * reordering guarantees. So barrier is needed after this call. > + */ > + cpumask_set_cpu(cpu, &waiting_cpus); > + > + barrier(); > + > + /* > + * Mark entry to slowpath before doing the pickup test to make > + * sure we don't deadlock with an unlocker. > + */ > + __ticket_enter_slowpath(lock); > + > + /* > + * check again make sure it didn't become free while > + * we weren't looking. 
> + */ > + if (ACCESS_ONCE(lock->tickets.head) == want) { > + ADD_STATS(taken_slow_pickup, 1); > + goto out; > + } > + > + /* Allow interrupts while blocked */ > + local_irq_restore(flags); > + > + kvm_wait_for_kick(); > + > + local_irq_save(flags); > +out: > + cpumask_clear_cpu(cpu, &waiting_cpus); > + w->lock = NULL; > + local_irq_restore(flags); > + spin_time_accum_blocked(start); > +} > +PV_CALLEE_SAVE_REGS_THUNK(kvm_lock_spinning); > + > +/* Kick a cpu */ > +static inline void kvm_kick_cpu(int cpu) > +{ > + kvm_hypercall1(KVM_HC_KICK_CPU, cpu); > +} > + > +/* Kick vcpu waiting on @lock->head to reach value @ticket */ > +static void kvm_unlock_kick(struct arch_spinlock *lock, __ticket_t ticket) > +{ > + int cpu; > + > + ADD_STATS(released_slow, 1); > + > + for_each_cpu(cpu, &waiting_cpus) { > + const struct kvm_lock_waiting *w = &per_cpu(lock_waiting, cpu); > + if (ACCESS_ONCE(w->lock) == lock && > + ACCESS_ONCE(w->want) == ticket) { > + ADD_STATS(released_slow_kicked, 1); > + kvm_kick_cpu(cpu); > + break; > + } > + } > +} > + > +/* > + * Setup pv_lock_ops to exploit KVM_FEATURE_WAIT_FOR_KICK if present. > + * This needs to be setup really early in boot, before the first call to > + * spinlock is issued! Actually, it doesn't matter that much. The in-memory format is the same for regular and PV spinlocks, and the PV paths only come into play if the "slowpath" flag is set in the lock, which it never will be by the non-PV code. In principle, you could defer initializing PV ticketlocks until some arbitrarily late point if you notice that the system is oversubscribed enough to require it. The main constraint at present is that you need to update the pv_lock_ops structure before pvops patching happens, or you won't see any effect from making changes. > + */ > +void __init kvm_guest_early_init(void) > +{ > + if (!kvm_para_available()) > + return; > + /* Does host kernel support KVM_FEATURE_WAIT_FOR_KICK? 
*/ > + if (!kvm_para_has_feature(KVM_FEATURE_WAIT_FOR_KICK)) > + return; > + > + jump_label_inc(&paravirt_ticketlocks_enabled); > + > + pv_lock_ops.lock_spinning = PV_CALLEE_SAVE(kvm_lock_spinning); > + pv_lock_ops.unlock_kick = kvm_unlock_kick; > +} > +#endif /* CONFIG_PARAVIRT_SPINLOCKS */ J -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html