On 20/07/16 04:58, David Matlack wrote: > On Tue, Jul 19, 2016 at 1:12 AM, Suraj Jitindar Singh > <sjitindarsingh@xxxxxxxxx> wrote: >> This patch introduces new halt polling functionality into the kvm_hv kernel >> module. When a vcore is idle it will poll for some period of time before >> scheduling itself out. >> >> When all of the runnable vcpus on a vcore have ceded (and thus the vcore is >> idle) we schedule ourselves out to allow something else to run. In the >> event that we need to wake up very quickly (for example an interrupt >> arrives), we are required to wait until we get scheduled again. >> >> Implement halt polling so that when a vcore is idle, and before scheduling >> ourselves, we poll for vcpus in the runnable_threads list which have >> pending exceptions or which leave the ceded state. If we poll successfully >> then we can get back into the guest very quickly without ever scheduling >> ourselves, otherwise we schedule ourselves out as before. >> >> Testing of this patch with a TCP round robin test between two guests with >> virtio network interfaces has found a decrease in round trip time of ~15us >> on average. A performance gain is only seen when going out of and >> back into the guest often and quickly, otherwise there is no net benefit >> from the polling. The polling interval is adjusted such that when we are >> often scheduled out for long periods of time it is reduced, and when we >> often poll successfully it is increased. The rate at which the polling >> interval increases or decreases, and the maximum polling interval, can >> be set through module parameters. >> >> Based on the implementation in the generic kvm module by Wanpeng Li and >> Paolo Bonzini, and on direction from Paul Mackerras. 
>> >> Signed-off-by: Suraj Jitindar Singh <sjitindarsingh@xxxxxxxxx> >> --- >> arch/powerpc/include/asm/kvm_book3s.h | 1 + >> arch/powerpc/include/asm/kvm_host.h | 1 + >> arch/powerpc/kvm/book3s_hv.c | 116 ++++++++++++++++++++++++++++++---- >> arch/powerpc/kvm/trace_hv.h | 22 +++++++ >> 4 files changed, 126 insertions(+), 14 deletions(-) >> >> diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h >> index 151f817..c261f52 100644 >> --- a/arch/powerpc/include/asm/kvm_book3s.h >> +++ b/arch/powerpc/include/asm/kvm_book3s.h >> @@ -102,6 +102,7 @@ struct kvmppc_vcore { >> ulong pcr; >> ulong dpdes; /* doorbell state (POWER8) */ >> ulong conferring_threads; >> + unsigned int halt_poll_ns; >> }; >> >> struct kvmppc_vcpu_book3s { >> diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h >> index 02d06e9..610f393 100644 >> --- a/arch/powerpc/include/asm/kvm_host.h >> +++ b/arch/powerpc/include/asm/kvm_host.h >> @@ -294,6 +294,7 @@ struct kvm_arch { >> #define VCORE_SLEEPING 3 >> #define VCORE_RUNNING 4 >> #define VCORE_EXITING 5 >> +#define VCORE_POLLING 6 >> >> /* >> * Struct used to manage memory for a virtual processor area >> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c >> index 3bcf9e6..a9de1d4 100644 >> --- a/arch/powerpc/kvm/book3s_hv.c >> +++ b/arch/powerpc/kvm/book3s_hv.c >> @@ -94,6 +94,23 @@ module_param_cb(h_ipi_redirect, &module_param_ops, &h_ipi_redirect, >> MODULE_PARM_DESC(h_ipi_redirect, "Redirect H_IPI wakeup to a free host core"); >> #endif >> >> +/* Maximum halt poll interval defaults to KVM_HALT_POLL_NS_DEFAULT */ >> +static unsigned int halt_poll_max_ns = KVM_HALT_POLL_NS_DEFAULT; >> +module_param(halt_poll_max_ns, uint, S_IRUGO | S_IWUSR); >> +MODULE_PARM_DESC(halt_poll_max_ns, "Maximum halt poll time in ns"); >> + >> +/* Factor by which the vcore halt poll interval is grown, default is to double >> + */ >> +static unsigned int halt_poll_ns_grow = 2; >> 
+module_param(halt_poll_ns_grow, int, S_IRUGO); >> +MODULE_PARM_DESC(halt_poll_ns_grow, "Factor halt poll time is grown by"); >> + >> +/* Factor by which the vcore halt poll interval is shrunk, default is to reset >> + */ >> +static unsigned int halt_poll_ns_shrink; >> +module_param(halt_poll_ns_shrink, int, S_IRUGO); >> +MODULE_PARM_DESC(halt_poll_ns_shrink, "Factor halt poll time is shrunk by"); >> + >> static void kvmppc_end_cede(struct kvm_vcpu *vcpu); >> static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu); >> >> @@ -2620,32 +2637,82 @@ static void kvmppc_wait_for_exec(struct kvmppc_vcore *vc, >> finish_wait(&vcpu->arch.cpu_run, &wait); >> } >> >> +static void grow_halt_poll_ns(struct kvmppc_vcore *vc) >> +{ >> + /* 10us base */ >> + if (vc->halt_poll_ns == 0 && halt_poll_ns_grow) >> + vc->halt_poll_ns = 10000; >> + else >> + vc->halt_poll_ns *= halt_poll_ns_grow; >> + >> + if (vc->halt_poll_ns > halt_poll_max_ns) >> + vc->halt_poll_ns = halt_poll_max_ns; >> +} >> + >> +static void shrink_halt_poll_ns(struct kvmppc_vcore *vc) >> +{ >> + if (halt_poll_ns_shrink == 0) >> + vc->halt_poll_ns = 0; >> + else >> + vc->halt_poll_ns /= halt_poll_ns_shrink; >> +} >> + >> +/* Check to see if any of the runnable vcpus on the vcore have pending >> + * exceptions or are no longer ceded >> + */ >> +static int kvmppc_vcore_check_block(struct kvmppc_vcore *vc) >> +{ >> + struct kvm_vcpu *vcpu; >> + int i; >> + >> + for_each_runnable_thread(i, vcpu, vc) { >> + if (vcpu->arch.pending_exceptions || !vcpu->arch.ceded) >> + return 1; >> + } >> + >> + return 0; >> +} >> + >> /* >> * All the vcpus in this vcore are idle, so wait for a decrementer >> * or external interrupt to one of the vcpus. vc->lock is held. 
>> */ >> static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc) >> { >> - struct kvm_vcpu *vcpu; >> - int do_sleep = 1, i; >> + int do_sleep = 1; >> + ktime_t cur, start; >> + u64 block_ns; >> DECLARE_SWAITQUEUE(wait); >> >> - prepare_to_swait(&vc->wq, &wait, TASK_INTERRUPTIBLE); >> + /* Poll for pending exceptions and ceded state */ >> + cur = start = ktime_get(); >> + if (vc->halt_poll_ns) { >> + ktime_t stop = ktime_add_ns(start, vc->halt_poll_ns); >> >> - /* >> - * Check one last time for pending exceptions and ceded state after >> - * we put ourselves on the wait queue >> - */ >> - for_each_runnable_thread(i, vcpu, vc) { >> - if (vcpu->arch.pending_exceptions || !vcpu->arch.ceded) { >> - do_sleep = 0; >> - break; >> - } >> + vc->vcore_state = VCORE_POLLING; >> + spin_unlock(&vc->lock); >> + >> + do { >> + if (kvmppc_vcore_check_block(vc)) { >> + do_sleep = 0; >> + break; >> + } >> + cur = ktime_get(); >> + } while (ktime_before(cur, stop)); > The generic halt-polling implementation only polls if there are no > other threads waiting to run (single_task_running() == 1). Should/Can > we do the same here? Thanks for catching that. Yes, I think we should. 
> >> + >> + spin_lock(&vc->lock); >> + vc->vcore_state = VCORE_INACTIVE; >> + >> + if (!do_sleep) >> + goto out; >> } >> >> - if (!do_sleep) { >> + prepare_to_swait(&vc->wq, &wait, TASK_INTERRUPTIBLE); >> + >> + if (kvmppc_vcore_check_block(vc)) { >> finish_swait(&vc->wq, &wait); >> - return; >> + do_sleep = 0; >> + goto out; >> } >> >> vc->vcore_state = VCORE_SLEEPING; >> @@ -2656,6 +2723,27 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc) >> spin_lock(&vc->lock); >> vc->vcore_state = VCORE_INACTIVE; >> trace_kvmppc_vcore_blocked(vc, 1); >> + >> + cur = ktime_get(); >> + >> +out: >> + block_ns = ktime_to_ns(cur) - ktime_to_ns(start); >> + >> + /* Adjust poll time */ >> + if (halt_poll_max_ns) { >> + if (block_ns <= vc->halt_poll_ns) >> + ; >> + /* We slept and blocked for longer than the max halt time */ >> + else if (vc->halt_poll_ns && block_ns > halt_poll_max_ns) >> + shrink_halt_poll_ns(vc); >> + /* We slept and our poll time is too small */ >> + else if (vc->halt_poll_ns < halt_poll_max_ns && >> + block_ns < halt_poll_max_ns) >> + grow_halt_poll_ns(vc); >> + } else >> + vc->halt_poll_ns = 0; >> + >> + trace_kvmppc_vcore_wakeup(do_sleep, block_ns); >> } >> >> static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) >> diff --git a/arch/powerpc/kvm/trace_hv.h b/arch/powerpc/kvm/trace_hv.h >> index 33d9daf..fb21990 100644 >> --- a/arch/powerpc/kvm/trace_hv.h >> +++ b/arch/powerpc/kvm/trace_hv.h >> @@ -432,6 +432,28 @@ TRACE_EVENT(kvmppc_vcore_blocked, >> __entry->runner_vcpu, __entry->n_runnable, __entry->tgid) >> ); >> >> +TRACE_EVENT(kvmppc_vcore_wakeup, >> + TP_PROTO(int do_sleep, __u64 ns), >> + >> + TP_ARGS(do_sleep, ns), >> + >> + TP_STRUCT__entry( >> + __field(__u64, ns) >> + __field(int, waited) >> + __field(pid_t, tgid) >> + ), >> + >> + TP_fast_assign( >> + __entry->ns = ns; >> + __entry->waited = do_sleep; >> + __entry->tgid = current->tgid; >> + ), >> + >> + TP_printk("%s time %lld ns, tgid=%d", >> + __entry->waited ? 
"wait" : "poll", >> + __entry->ns, __entry->tgid) >> +); >> + >> TRACE_EVENT(kvmppc_run_vcpu_enter, >> TP_PROTO(struct kvm_vcpu *vcpu), >> >> -- >> 2.5.5 >> -- To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html