Add a helper routine to kernel/sched/core.c that lets the kvm module
retrieve the CPU hard limit (CFS bandwidth quota/period) settings.  The
values are used to set up a per-vcpu timer that separates consigned time
from steal time.

Signed-off-by: Michael Wolf <mjw@xxxxxxxxxxxxxxxxxx>
---
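Notes (not intended for the changelog): the small userspace sketch below
only illustrates the accounting this patch relies on; it is not kernel
code.  The cgroup numbers (100ms period, 50ms quota, two vcpus) and the
90ms run_delay delta are made up, and the values are treated as plain
microseconds here, whereas the patch itself runs them through
jiffies_to_usecs().  It shows how sched_use_hard_capping() is meant to
turn quota/period into a per-vcpu "consigned" budget and how
accumulate_steal_time() splits a delta against that budget.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* Hypothetical cgroup settings: cpu.cfs_period_us, cpu.cfs_quota_us. */
	int64_t period_us = 100000;
	int64_t quota_us  = 50000;
	int num_vcpus     = 2;

	/*
	 * Consigned budget per period: the part of the period this vcpu is
	 * not entitled to run because of the cap (period - quota/num_vcpus).
	 */
	int64_t consigned_quota = period_us - quota_us / num_vcpus;

	/* One run_delay delta, split the way accumulate_steal_time() does. */
	int64_t current_consigned = 0;
	int64_t delta = 90000;
	int64_t consigned_delta, steal_delta;

	if (current_consigned < consigned_quota) {
		current_consigned += delta;
		if (current_consigned > consigned_quota) {
			/* budget exhausted part-way through this delta */
			steal_delta = current_consigned - consigned_quota;
			consigned_delta = delta - steal_delta;
		} else {
			consigned_delta = delta;
			steal_delta = 0;
		}
	} else {
		consigned_delta = 0;
		steal_delta = delta;
	}

	printf("consigned budget: %lld us/period\n", (long long)consigned_quota);
	printf("delta %lld us -> consigned %lld us, steal %lld us\n",
	       (long long)delta, (long long)consigned_delta,
	       (long long)steal_delta);
	return 0;
}

With these made-up numbers the consigned budget is 75ms per period, so a
90ms delta is split into 75ms of consigned time and 15ms of genuine steal
time.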
 arch/x86/include/asm/kvm_host.h |    9 ++++++
 arch/x86/kvm/x86.c              |   62 ++++++++++++++++++++++++++++++++++++++-
 kernel/sched/core.c             |   21 ++++++++++++++
 3 files changed, 91 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index fe5a37b..9518613 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -355,6 +355,15 @@ struct kvm_vcpu_arch {
 	bool tpr_access_reporting;
 
 	/*
+	 * timer used to determine if the time should be counted as
+	 * steal time or consigned time.
+	 */
+	struct hrtimer steal_timer;
+	u64 current_consigned;
+	s64 consigned_quota;
+	s64 consigned_period;
+
+	/*
 	 * Paging state of the vcpu
 	 *
 	 * If the vcpu runs in guest mode with two level paging this still saves
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 51b63d1..79d144d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1848,13 +1848,32 @@ static void kvmclock_reset(struct kvm_vcpu *vcpu)
 static void accumulate_steal_time(struct kvm_vcpu *vcpu)
 {
 	u64 delta;
+	u64 steal_delta;
+	u64 consigned_delta;
 
 	if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
 		return;
 
 	delta = current->sched_info.run_delay - vcpu->arch.st.last_steal;
 	vcpu->arch.st.last_steal = current->sched_info.run_delay;
-	vcpu->arch.st.accum_steal = delta;
+
+	/* split the delta into steal and consigned */
+	if (vcpu->arch.current_consigned < vcpu->arch.consigned_quota) {
+		vcpu->arch.current_consigned += delta;
+		if (vcpu->arch.current_consigned > vcpu->arch.consigned_quota) {
+			steal_delta = vcpu->arch.current_consigned
+					- vcpu->arch.consigned_quota;
+			consigned_delta = delta - steal_delta;
+		} else {
+			consigned_delta = delta;
+			steal_delta = 0;
+		}
+	} else {
+		consigned_delta = 0;
+		steal_delta = delta;
+	}
+	vcpu->arch.st.accum_steal = steal_delta;
+	vcpu->arch.st.accum_consigned = consigned_delta;
 }
 
 static void record_steal_time(struct kvm_vcpu *vcpu)
@@ -2629,8 +2648,35 @@ static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu)
 		!(vcpu->kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY);
 }
 
+extern int sched_use_hard_capping(int cpuid, int num_vcpus, s64 *quota,
+				  s64 *period);
+enum hrtimer_restart steal_timer_fn(struct hrtimer *data)
+{
+	struct kvm_vcpu *vcpu;
+	struct kvm *kvm;
+	int num_vcpus;
+	ktime_t now;
+
+	vcpu = container_of(data, struct kvm_vcpu, arch.steal_timer);
+	kvm = vcpu->kvm;
+	num_vcpus = atomic_read(&kvm->online_vcpus);
+	sched_use_hard_capping(vcpu->cpu, num_vcpus,
+			       &vcpu->arch.consigned_quota,
+			       &vcpu->arch.consigned_period);
+	vcpu->arch.current_consigned = 0;
+	now = ktime_get();
+	hrtimer_forward(&vcpu->arch.steal_timer, now,
+			ktime_set(0, vcpu->arch.consigned_period));
+
+	return HRTIMER_RESTART;
+}
+
 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
+	struct kvm *kvm;
+	int num_vcpus;
+	ktime_t ktime;
+
 	/* Address WBINVD may be executed by guest */
 	if (need_emulate_wbinvd(vcpu)) {
 		if (kvm_x86_ops->has_wbinvd_exit())
@@ -2670,6 +2716,18 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 		kvm_migrate_timers(vcpu);
 		vcpu->cpu = cpu;
 	}
+	/* Initialize and start a timer to capture steal and consigned time */
+	kvm = vcpu->kvm;
+	num_vcpus = atomic_read(&kvm->online_vcpus);
+	num_vcpus = (num_vcpus == 0) ? 1 : num_vcpus;
+	sched_use_hard_capping(vcpu->cpu, num_vcpus,
+			       &vcpu->arch.consigned_quota,
+			       &vcpu->arch.consigned_period);
+	hrtimer_init(&vcpu->arch.steal_timer, CLOCK_MONOTONIC,
+		     HRTIMER_MODE_REL);
+	vcpu->arch.steal_timer.function = &steal_timer_fn;
+	ktime = ktime_set(0, vcpu->arch.consigned_period);
+	hrtimer_start(&vcpu->arch.steal_timer, ktime, HRTIMER_MODE_REL);
 
 	accumulate_steal_time(vcpu);
 	kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
@@ -2680,6 +2738,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 	kvm_x86_ops->vcpu_put(vcpu);
 	kvm_put_guest_fpu(vcpu);
 	vcpu->arch.last_host_tsc = native_read_tsc();
+	hrtimer_cancel(&vcpu->arch.steal_timer);
 }
 
 static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
@@ -6685,6 +6744,7 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
 {
 	int idx;
 
+	hrtimer_cancel(&vcpu->arch.steal_timer);
 	kvm_pmu_destroy(vcpu);
 	kfree(vcpu->arch.mce_banks);
 	kvm_free_lapic(vcpu);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index efc2652..133ee47 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -8154,6 +8154,27 @@ void cpuacct_charge(struct task_struct *tsk, u64 cputime)
 	rcu_read_unlock();
 }
 
+/*
+ * return 1 if the scheduler is using some form of hard capping
+ * return 0 if there is no capping configured.
+ */
+int sched_use_hard_capping(int cpuid, int num_cpus, long *quota, long *period)
+{
+	struct rq *rq = cpu_rq(cpuid);
+	struct task_struct *curr = rq->curr;
+	struct task_group *tg = curr->sched_task_group;
+	long total_time;
+
+	*period = tg_get_cfs_period(tg);
+	*quota = tg_get_cfs_quota(tg);
+	if (*quota == RUNTIME_INF || *quota == -1)
+		return 0;
+	*quota = jiffies_to_usecs(*quota) / num_cpus;
+	total_time = jiffies_to_usecs(*period);
+	*quota = total_time - *quota;
+	return 1;
+}
+EXPORT_SYMBOL_GPL(sched_use_hard_capping);
 
 struct cgroup_subsys cpuacct_subsys = {
 	.name = "cpuacct",
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html