On Fri, Feb 21, 2025 at 02:39:27PM +0900, Suleiman Souhlal wrote:
> When the host resumes from a suspend, the guest thinks any task
> that was running during the suspend ran for a long time, even though
> the effective run time was much shorter, which can end up having
> negative effects with scheduling.
>
> To mitigate this issue, the time that the host was suspended is included
> in steal time, which lets the guest can subtract the duration from the

s/can//

> tasks' runtime.
>
> In order to implement this behavior, once the suspend notifier fires,
> vCPUs trying to run block until the resume notifier finishes. This is

s/run/run will/

> because the freezing of userspace tasks happens between these two points,

Full stop at the end of that ^

> which means that vCPUs could otherwise run and get their suspend steal
> time misaccounted, particularly if a vCPU would run after resume before
> the resume notifier.

s/notifier/notifier fires/

> Incidentally, doing this also addresses a potential race with the
> suspend notifier setting PVCLOCK_GUEST_STOPPED, which could then get
> cleared before the suspend actually happened.
>
> One potential caveat is that in the case of a suspend happening during
> a VM migration, the suspend time might not be accounted.

s/accounted/accounted for./

> A workaround would be for the VMM to ensure that the guest is entered
> with KVM_RUN after resuming from suspend.

So ..does that mean there is a QEMU patch as well?

>
> Signed-off-by: Suleiman Souhlal <suleiman@xxxxxxxxxx>
> ---
>  Documentation/virt/kvm/x86/msr.rst | 10 ++++--
>  arch/x86/include/asm/kvm_host.h    |  6 ++++
>  arch/x86/kvm/x86.c                 | 51 ++++++++++++++++++++++++++++++
>  3 files changed, 65 insertions(+), 2 deletions(-)
>
> diff --git a/Documentation/virt/kvm/x86/msr.rst b/Documentation/virt/kvm/x86/msr.rst
> index 3aecf2a70e7b43..48f2a8ca519548 100644
> --- a/Documentation/virt/kvm/x86/msr.rst
> +++ b/Documentation/virt/kvm/x86/msr.rst
> @@ -294,8 +294,14 @@ data:
>
>  	steal:
>  		the amount of time in which this vCPU did not run, in
> -		nanoseconds. Time during which the vcpu is idle, will not be
> -		reported as steal time.
> +		nanoseconds. This includes the time during which the host is
> +		suspended. Time during which the vcpu is idle, might not be
> +		reported as steal time. The case where the host suspends
> +		during a VM migration might not be accounted if VCPUs aren't
> +		entered post-resume, because KVM does not currently support
> +		suspend/resuming the associated metadata. A workaround would
> +		be for the VMM to ensure that the guest is entered with
> +		KVM_RUN after resuming from suspend.
>
>  	preempted:
>  		indicate the vCPU who owns this struct is running or
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 452dd0204609af..007656ceac9a71 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -124,6 +124,7 @@
>  #define KVM_REQ_HV_TLB_FLUSH \
>  	KVM_ARCH_REQ_FLAGS(32, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
>  #define KVM_REQ_UPDATE_PROTECTED_GUEST_STATE	KVM_ARCH_REQ(34)
> +#define KVM_REQ_WAIT_FOR_RESUME			KVM_ARCH_REQ(35)
>
>  #define CR0_RESERVED_BITS                                               \
>  	(~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
> @@ -916,8 +917,13 @@ struct kvm_vcpu_arch {
>
>  	struct {
>  		u8 preempted;
> +		bool host_suspended;
>  		u64 msr_val;
>  		u64 last_steal;
> +		u64 last_suspend;
> +		u64 suspend_ns;
> +		u64 last_suspend_ns;
> +		wait_queue_head_t resume_waitq;
>  		struct gfn_to_hva_cache cache;
>  	} st;
>
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 06464ec0d1c8d2..f34edcf77cca0a 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -3717,6 +3717,8 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
>  		steal += current->sched_info.run_delay -
>  			vcpu->arch.st.last_steal;
>  		vcpu->arch.st.last_steal = current->sched_info.run_delay;
> +		steal += vcpu->arch.st.suspend_ns - vcpu->arch.st.last_suspend_ns;
> +		vcpu->arch.st.last_suspend_ns = vcpu->arch.st.suspend_ns;
>  		unsafe_put_user(steal, &st->steal, out);
>
>  		version += 1;
> @@ -6930,6 +6932,19 @@ long kvm_arch_vm_compat_ioctl(struct file *filp, unsigned int ioctl,
>  }
>  #endif
>
> +static void wait_for_resume(struct kvm_vcpu *vcpu)
> +{
> +	wait_event_interruptible(vcpu->arch.st.resume_waitq,
> +				 vcpu->arch.st.host_suspended == 0);
> +
> +	/*
> +	 * This might happen if we blocked here before the freezing of tasks
> +	 * and we get woken up by the freezer.
> +	 */
> +	if (vcpu->arch.st.host_suspended)
> +		kvm_make_request(KVM_REQ_WAIT_FOR_RESUME, vcpu);
> +}
> +
>  #ifdef CONFIG_HAVE_KVM_PM_NOTIFIER
>  static int kvm_arch_suspend_notifier(struct kvm *kvm)
>  {
> @@ -6939,6 +6954,19 @@ static int kvm_arch_suspend_notifier(struct kvm *kvm)
>
>  	mutex_lock(&kvm->lock);
>  	kvm_for_each_vcpu(i, vcpu, kvm) {
> +		vcpu->arch.st.last_suspend = ktime_get_boottime_ns();
> +		/*
> +		 * Tasks get thawed before the resume notifier has been called
> +		 * so we need to block vCPUs until the resume notifier has run.
> +		 * Otherwise, suspend steal time might get applied too late,
> +		 * and get accounted to the wrong guest task.
> +		 * This also ensures that the guest paused bit set below
> +		 * doesn't get checked and cleared before the host actually
> +		 * suspends.
> +		 */
> +		vcpu->arch.st.host_suspended = 1;
> +		kvm_make_request(KVM_REQ_WAIT_FOR_RESUME, vcpu);
> +
>  		if (!vcpu->arch.pv_time.active)
>  			continue;
>
> @@ -6954,12 +6982,32 @@ static int kvm_arch_suspend_notifier(struct kvm *kvm)
>  	return ret ? NOTIFY_BAD : NOTIFY_DONE;
>  }
>
> +static int kvm_arch_resume_notifier(struct kvm *kvm)
> +{
> +	struct kvm_vcpu *vcpu;
> +	unsigned long i;
> +
> +	mutex_lock(&kvm->lock);
> +	kvm_for_each_vcpu(i, vcpu, kvm) {
> +		vcpu->arch.st.host_suspended = 0;
> +		vcpu->arch.st.suspend_ns += ktime_get_boottime_ns() -
> +					    vcpu->arch.st.last_suspend;
> +		wake_up_interruptible(&vcpu->arch.st.resume_waitq);
> +	}
> +	mutex_unlock(&kvm->lock);
> +
> +	return NOTIFY_DONE;
> +}
> +
>  int kvm_arch_pm_notifier(struct kvm *kvm, unsigned long state)
>  {
>  	switch (state) {
>  	case PM_HIBERNATION_PREPARE:
>  	case PM_SUSPEND_PREPARE:
>  		return kvm_arch_suspend_notifier(kvm);
> +	case PM_POST_HIBERNATION:
> +	case PM_POST_SUSPEND:
> +		return kvm_arch_resume_notifier(kvm);
>  	}
>
>  	return NOTIFY_DONE;
> @@ -10813,6 +10861,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>  			r = 1;
>  			goto out;
>  		}
> +		if (kvm_check_request(KVM_REQ_WAIT_FOR_RESUME, vcpu))
> +			wait_for_resume(vcpu);
>  		if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu))
>  			record_steal_time(vcpu);
>  		if (kvm_check_request(KVM_REQ_PMU, vcpu))
> @@ -12341,6 +12391,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
>  	if (r)
>  		goto free_guest_fpu;
>
> +	init_waitqueue_head(&vcpu->arch.st.resume_waitq);
>  	kvm_xen_init_vcpu(vcpu);
>  	vcpu_load(vcpu);
>  	kvm_vcpu_after_set_cpuid(vcpu);
> --
> 2.48.1.601.g30ceb7b040-goog
>
>
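
On the KVM_RUN-after-resume workaround mentioned above: the userspace side
could be as small as kicking every vCPU thread once the VMM notices the host
has resumed, so each thread falls out of its interrupted ioctl and immediately
re-enters KVM_RUN, at which point the pending suspend steal time gets folded
into the next record_steal_time(). Rough, untested sketch of what I have in
mind; struct vmm_vcpu, the thread loop, and the resume hook are made-up VMM
bookkeeping, only the KVM_RUN ioctl and the signal-interrupts-ioctl behavior
are standard:

#include <errno.h>
#include <pthread.h>
#include <signal.h>
#include <stddef.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

struct vmm_vcpu {
	int fd;			/* from KVM_CREATE_VCPU */
	pthread_t thread;	/* thread sitting in ioctl(fd, KVM_RUN, 0) */
};

/* Simplified vCPU thread loop: keep re-entering KVM_RUN after interruptions. */
static void *vmm_vcpu_thread(void *arg)
{
	struct vmm_vcpu *vcpu = arg;

	for (;;) {
		if (ioctl(vcpu->fd, KVM_RUN, 0) < 0 && errno != EINTR)
			break;	/* real error; EINTR just means we were kicked */
		/* handle kvm_run->exit_reason here */
	}
	return NULL;
}

/* Called from the VMM's resume hook, however the VMM learns about resume. */
static void vmm_kick_vcpus_after_resume(struct vmm_vcpu *vcpus, int nr_vcpus)
{
	int i;

	for (i = 0; i < nr_vcpus; i++)
		/*
		 * A signal the vCPU threads have a handler for (installed
		 * without SA_RESTART) makes a blocked KVM_RUN return -EINTR,
		 * so the loop above re-enters the guest right away.
		 */
		pthread_kill(vcpus[i].thread, SIGUSR1);
}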