On 07/19/2016 10:12 AM, Suraj Jitindar Singh wrote:
> This patch introduces new halt polling functionality into the kvm_hv kernel
> module. When a vcore is idle it will poll for some period of time before
> scheduling itself out.

Some wording on why you cannot use the common code might be useful.

>
> When all of the runnable vcpus on a vcore have ceded (and thus the vcore is
> idle) we schedule ourselves out to allow something else to run. In the
> event that we need to wake up very quickly (for example an interrupt
> arrives), we are required to wait until we get scheduled again.
>
> Implement halt polling so that when a vcore is idle, and before scheduling
> ourselves, we poll for vcpus in the runnable_threads list which have
> pending exceptions or which leave the ceded state. If we poll successfully
> then we can get back into the guest very quickly without ever scheduling
> ourselves, otherwise we schedule ourselves out as before.
>
> Testing of this patch with a TCP round robin test between two guests with
> virtio network interfaces has found a decrease in round trip time of ~15us
> on average. A performance gain is only seen when going out of and
> back into the guest often and quickly, otherwise there is no net benefit
> from the polling. The polling interval is adjusted such that when we are
> often scheduled out for long periods of time it is reduced, and when we
> often poll successfully it is increased. The rate at which the polling
> interval increases or decreases, and the maximum polling interval, can
> be set through module parameters.
>
> Based on the implementation in the generic kvm module by Wanpeng Li and
> Paolo Bonzini, and on direction from Paul Mackerras.
> > Signed-off-by: Suraj Jitindar Singh <sjitindarsingh@xxxxxxxxx> > --- > arch/powerpc/include/asm/kvm_book3s.h | 1 + > arch/powerpc/include/asm/kvm_host.h | 1 + > arch/powerpc/kvm/book3s_hv.c | 116 ++++++++++++++++++++++++++++++---- > arch/powerpc/kvm/trace_hv.h | 22 +++++++ > 4 files changed, 126 insertions(+), 14 deletions(-) > > diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h > index 151f817..c261f52 100644 > --- a/arch/powerpc/include/asm/kvm_book3s.h > +++ b/arch/powerpc/include/asm/kvm_book3s.h > @@ -102,6 +102,7 @@ struct kvmppc_vcore { > ulong pcr; > ulong dpdes; /* doorbell state (POWER8) */ > ulong conferring_threads; > + unsigned int halt_poll_ns; > }; > > struct kvmppc_vcpu_book3s { > diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h > index 02d06e9..610f393 100644 > --- a/arch/powerpc/include/asm/kvm_host.h > +++ b/arch/powerpc/include/asm/kvm_host.h > @@ -294,6 +294,7 @@ struct kvm_arch { > #define VCORE_SLEEPING 3 > #define VCORE_RUNNING 4 > #define VCORE_EXITING 5 > +#define VCORE_POLLING 6 > > /* > * Struct used to manage memory for a virtual processor area > diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c > index 3bcf9e6..a9de1d4 100644 > --- a/arch/powerpc/kvm/book3s_hv.c > +++ b/arch/powerpc/kvm/book3s_hv.c > @@ -94,6 +94,23 @@ module_param_cb(h_ipi_redirect, &module_param_ops, &h_ipi_redirect, > MODULE_PARM_DESC(h_ipi_redirect, "Redirect H_IPI wakeup to a free host core"); > #endif > > +/* Maximum halt poll interval defaults to KVM_HALT_POLL_NS_DEFAULT */ > +static unsigned int halt_poll_max_ns = KVM_HALT_POLL_NS_DEFAULT; > +module_param(halt_poll_max_ns, uint, S_IRUGO | S_IWUSR); > +MODULE_PARM_DESC(halt_poll_max_ns, "Maximum halt poll time in ns"); > + > +/* Factor by which the vcore halt poll interval is grown, default is to double > + */ > +static unsigned int halt_poll_ns_grow = 2; > +module_param(halt_poll_ns_grow, int, S_IRUGO); > 
+MODULE_PARM_DESC(halt_poll_ns_grow, "Factor halt poll time is grown by"); > + > +/* Factor by which the vcore halt poll interval is shrunk, default is to reset > + */ > +static unsigned int halt_poll_ns_shrink; > +module_param(halt_poll_ns_shrink, int, S_IRUGO); > +MODULE_PARM_DESC(halt_poll_ns_shrink, "Factor halt poll time is shrunk by"); > + > static void kvmppc_end_cede(struct kvm_vcpu *vcpu); > static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu); > > @@ -2620,32 +2637,82 @@ static void kvmppc_wait_for_exec(struct kvmppc_vcore *vc, > finish_wait(&vcpu->arch.cpu_run, &wait); > } > > +static void grow_halt_poll_ns(struct kvmppc_vcore *vc) > +{ > + /* 10us base */ > + if (vc->halt_poll_ns == 0 && halt_poll_ns_grow) > + vc->halt_poll_ns = 10000; > + else > + vc->halt_poll_ns *= halt_poll_ns_grow; > + > + if (vc->halt_poll_ns > halt_poll_max_ns) > + vc->halt_poll_ns = halt_poll_max_ns; > +} > + > +static void shrink_halt_poll_ns(struct kvmppc_vcore *vc) > +{ > + if (halt_poll_ns_shrink == 0) > + vc->halt_poll_ns = 0; > + else > + vc->halt_poll_ns /= halt_poll_ns_shrink; > +} > + > +/* Check to see if any of the runnable vcpus on the vcore have pending > + * exceptions or are no longer ceded > + */ > +static int kvmppc_vcore_check_block(struct kvmppc_vcore *vc) > +{ > + struct kvm_vcpu *vcpu; > + int i; > + > + for_each_runnable_thread(i, vcpu, vc) { > + if (vcpu->arch.pending_exceptions || !vcpu->arch.ceded) > + return 1; > + } > + > + return 0; > +} > + > /* > * All the vcpus in this vcore are idle, so wait for a decrementer > * or external interrupt to one of the vcpus. vc->lock is held. 
> */ > static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc) > { > - struct kvm_vcpu *vcpu; > - int do_sleep = 1, i; > + int do_sleep = 1; > + ktime_t cur, start; > + u64 block_ns; > DECLARE_SWAITQUEUE(wait); > > - prepare_to_swait(&vc->wq, &wait, TASK_INTERRUPTIBLE); > + /* Poll for pending exceptions and ceded state */ > + cur = start = ktime_get(); > + if (vc->halt_poll_ns) { > + ktime_t stop = ktime_add_ns(start, vc->halt_poll_ns); > > - /* > - * Check one last time for pending exceptions and ceded state after > - * we put ourselves on the wait queue > - */ > - for_each_runnable_thread(i, vcpu, vc) { > - if (vcpu->arch.pending_exceptions || !vcpu->arch.ceded) { > - do_sleep = 0; > - break; > - } > + vc->vcore_state = VCORE_POLLING; > + spin_unlock(&vc->lock); > + > + do { > + if (kvmppc_vcore_check_block(vc)) { > + do_sleep = 0; > + break; > + } > + cur = ktime_get(); > + } while (ktime_before(cur, stop)); > + > + spin_lock(&vc->lock); > + vc->vcore_state = VCORE_INACTIVE; > + > + if (!do_sleep) > + goto out; > } > > - if (!do_sleep) { > + prepare_to_swait(&vc->wq, &wait, TASK_INTERRUPTIBLE); > + > + if (kvmppc_vcore_check_block(vc)) { > finish_swait(&vc->wq, &wait); > - return; > + do_sleep = 0; > + goto out; > } > > vc->vcore_state = VCORE_SLEEPING; > @@ -2656,6 +2723,27 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc) > spin_lock(&vc->lock); > vc->vcore_state = VCORE_INACTIVE; > trace_kvmppc_vcore_blocked(vc, 1); > + > + cur = ktime_get(); > + > +out: > + block_ns = ktime_to_ns(cur) - ktime_to_ns(start); > + > + /* Adjust poll time */ > + if (halt_poll_max_ns) { > + if (block_ns <= vc->halt_poll_ns) > + ; > + /* We slept and blocked for longer than the max halt time */ > + else if (vc->halt_poll_ns && block_ns > halt_poll_max_ns) > + shrink_halt_poll_ns(vc); > + /* We slept and our poll time is too small */ > + else if (vc->halt_poll_ns < halt_poll_max_ns && > + block_ns < halt_poll_max_ns) > + grow_halt_poll_ns(vc); > + } else > + 
vc->halt_poll_ns = 0; > + > + trace_kvmppc_vcore_wakeup(do_sleep, block_ns); > } > > static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) > diff --git a/arch/powerpc/kvm/trace_hv.h b/arch/powerpc/kvm/trace_hv.h > index 33d9daf..fb21990 100644 > --- a/arch/powerpc/kvm/trace_hv.h > +++ b/arch/powerpc/kvm/trace_hv.h > @@ -432,6 +432,28 @@ TRACE_EVENT(kvmppc_vcore_blocked, > __entry->runner_vcpu, __entry->n_runnable, __entry->tgid) > ); > > +TRACE_EVENT(kvmppc_vcore_wakeup, > + TP_PROTO(int do_sleep, __u64 ns), > + > + TP_ARGS(do_sleep, ns), > + > + TP_STRUCT__entry( > + __field(__u64, ns) > + __field(int, waited) > + __field(pid_t, tgid) > + ), > + > + TP_fast_assign( > + __entry->ns = ns; > + __entry->waited = do_sleep; > + __entry->tgid = current->tgid; > + ), > + > + TP_printk("%s time %lld ns, tgid=%d", > + __entry->waited ? "wait" : "poll", > + __entry->ns, __entry->tgid) > +); > + > TRACE_EVENT(kvmppc_run_vcpu_enter, > TP_PROTO(struct kvm_vcpu *vcpu), > -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html