On Fri, Feb 21, 2025 at 02:39:27PM +0900, Suleiman Souhlal wrote:
> When the host resumes from a suspend, the guest thinks any task
> that was running during the suspend ran for a long time, even though
> the effective run time was much shorter, which can end up having
> negative effects with scheduling.
>
> To mitigate this issue, the time that the host was suspended is included
> in steal time, which lets the guest can subtract the duration from the

s/can//

> tasks' runtime.
>
> In order to implement this behavior, once the suspend notifier fires,
> vCPUs trying to run block until the resume notifier finishes. This is

s/run/run will/

> because the freezing of userspace tasks happens between these two points,

Full stop at the end of that ^

> which means that vCPUs could otherwise run and get their suspend steal
> time misaccounted, particularly if a vCPU would run after resume before
> the resume notifier.

s/notifier/notifier fires/

> Incidentally, doing this also addresses a potential race with the
> suspend notifier setting PVCLOCK_GUEST_STOPPED, which could then get
> cleared before the suspend actually happened.
>
> One potential caveat is that in the case of a suspend happening during
> a VM migration, the suspend time might not be accounted.

s/accounted/accounted for./

> A workaround would be for the VMM to ensure that the guest is entered
> with KVM_RUN after resuming from suspend.

So ..does that mean there is a QEMU patch as well?

>
> Signed-off-by: Suleiman Souhlal <suleiman@xxxxxxxxxx>
> ---
>  Documentation/virt/kvm/x86/msr.rst | 10 ++++--
>  arch/x86/include/asm/kvm_host.h    |  6 ++++
>  arch/x86/kvm/x86.c                 | 51 ++++++++++++++++++++++++++++++
>  3 files changed, 65 insertions(+), 2 deletions(-)
>
> diff --git a/Documentation/virt/kvm/x86/msr.rst b/Documentation/virt/kvm/x86/msr.rst
> index 3aecf2a70e7b43..48f2a8ca519548 100644
> --- a/Documentation/virt/kvm/x86/msr.rst
> +++ b/Documentation/virt/kvm/x86/msr.rst
> @@ -294,8 +294,14 @@ data:
>
>  	steal:
>  		the amount of time in which this vCPU did not run, in
> -		nanoseconds. Time during which the vcpu is idle, will not be
> -		reported as steal time.
> +		nanoseconds. This includes the time during which the host is
> +		suspended. Time during which the vcpu is idle, might not be
> +		reported as steal time. The case where the host suspends
> +		during a VM migration might not be accounted if VCPUs aren't
> +		entered post-resume, because KVM does not currently support
> +		suspend/resuming the associated metadata. A workaround would
> +		be for the VMM to ensure that the guest is entered with
> +		KVM_RUN after resuming from suspend.
>
>  	preempted:
>  		indicate the vCPU who owns this struct is running or
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 452dd0204609af..007656ceac9a71 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -124,6 +124,7 @@
>  #define KVM_REQ_HV_TLB_FLUSH \
>  	KVM_ARCH_REQ_FLAGS(32, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
>  #define KVM_REQ_UPDATE_PROTECTED_GUEST_STATE	KVM_ARCH_REQ(34)
> +#define KVM_REQ_WAIT_FOR_RESUME			KVM_ARCH_REQ(35)
>
>  #define CR0_RESERVED_BITS                                               \
>  	(~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
> @@ -916,8 +917,13 @@ struct kvm_vcpu_arch {
>
>  	struct {
>  		u8 preempted;
> +		bool host_suspended;
>  		u64 msr_val;
>  		u64 last_steal;
> +		u64 last_suspend;
> +		u64 suspend_ns;
> +		u64 last_suspend_ns;
> +		wait_queue_head_t resume_waitq;
>  		struct gfn_to_hva_cache cache;
>  	} st;
>
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 06464ec0d1c8d2..f34edcf77cca0a 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -3717,6 +3717,8 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
>  		steal += current->sched_info.run_delay -
>  			vcpu->arch.st.last_steal;
>  		vcpu->arch.st.last_steal = current->sched_info.run_delay;
> +		steal += vcpu->arch.st.suspend_ns - vcpu->arch.st.last_suspend_ns;
> +		vcpu->arch.st.last_suspend_ns = vcpu->arch.st.suspend_ns;
>  		unsafe_put_user(steal, &st->steal, out);
>
>  		version += 1;
> @@ -6930,6 +6932,19 @@ long kvm_arch_vm_compat_ioctl(struct file *filp, unsigned int ioctl,
>  }
>  #endif
>
> +static void wait_for_resume(struct kvm_vcpu *vcpu)
> +{
> +	wait_event_interruptible(vcpu->arch.st.resume_waitq,
> +				 vcpu->arch.st.host_suspended == 0);
> +
> +	/*
> +	 * This might happen if we blocked here before the freezing of tasks
> +	 * and we get woken up by the freezer.
> +	 */
> +	if (vcpu->arch.st.host_suspended)
> +		kvm_make_request(KVM_REQ_WAIT_FOR_RESUME, vcpu);
> +}
> +
>  #ifdef CONFIG_HAVE_KVM_PM_NOTIFIER
>  static int kvm_arch_suspend_notifier(struct kvm *kvm)
>  {
> @@ -6939,6 +6954,19 @@ static int kvm_arch_suspend_notifier(struct kvm *kvm)
>
>  	mutex_lock(&kvm->lock);
>  	kvm_for_each_vcpu(i, vcpu, kvm) {
> +		vcpu->arch.st.last_suspend = ktime_get_boottime_ns();
> +		/*
> +		 * Tasks get thawed before the resume notifier has been called
> +		 * so we need to block vCPUs until the resume notifier has run.
> +		 * Otherwise, suspend steal time might get applied too late,
> +		 * and get accounted to the wrong guest task.
> +		 * This also ensures that the guest paused bit set below
> +		 * doesn't get checked and cleared before the host actually
> +		 * suspends.
> +		 */
> +		vcpu->arch.st.host_suspended = 1;
> +		kvm_make_request(KVM_REQ_WAIT_FOR_RESUME, vcpu);
> +
>  		if (!vcpu->arch.pv_time.active)
>  			continue;
>
> @@ -6954,12 +6982,32 @@ static int kvm_arch_suspend_notifier(struct kvm *kvm)
>  	return ret ? NOTIFY_BAD : NOTIFY_DONE;
>  }
>
> +static int kvm_arch_resume_notifier(struct kvm *kvm)
> +{
> +	struct kvm_vcpu *vcpu;
> +	unsigned long i;
> +
> +	mutex_lock(&kvm->lock);
> +	kvm_for_each_vcpu(i, vcpu, kvm) {
> +		vcpu->arch.st.host_suspended = 0;
> +		vcpu->arch.st.suspend_ns += ktime_get_boottime_ns() -
> +					    vcpu->arch.st.last_suspend;
> +		wake_up_interruptible(&vcpu->arch.st.resume_waitq);
> +	}
> +	mutex_unlock(&kvm->lock);
> +
> +	return NOTIFY_DONE;
> +}
> +
>  int kvm_arch_pm_notifier(struct kvm *kvm, unsigned long state)
>  {
>  	switch (state) {
>  	case PM_HIBERNATION_PREPARE:
>  	case PM_SUSPEND_PREPARE:
>  		return kvm_arch_suspend_notifier(kvm);
> +	case PM_POST_HIBERNATION:
> +	case PM_POST_SUSPEND:
> +		return kvm_arch_resume_notifier(kvm);
>  	}
>
>  	return NOTIFY_DONE;
> @@ -10813,6 +10861,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>  			r = 1;
>  			goto out;
>  		}
> +		if (kvm_check_request(KVM_REQ_WAIT_FOR_RESUME, vcpu))
> +			wait_for_resume(vcpu);
>  		if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu))
>  			record_steal_time(vcpu);
>  		if (kvm_check_request(KVM_REQ_PMU, vcpu))
> @@ -12341,6 +12391,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
>  	if (r)
>  		goto free_guest_fpu;
>
> +	init_waitqueue_head(&vcpu->arch.st.resume_waitq);
>  	kvm_xen_init_vcpu(vcpu);
>  	vcpu_load(vcpu);
>  	kvm_vcpu_after_set_cpuid(vcpu);
> --
> 2.48.1.601.g30ceb7b040-goog
>
>
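
On the KVM_RUN-after-resume workaround mentioned above: the userspace side
could be as small as kicking every vCPU thread once the VMM notices the host
has resumed, so each thread falls out of its interrupted ioctl and immediately
re-enters KVM_RUN, at which point the pending suspend steal time gets folded
into the next record_steal_time(). Rough, untested sketch of what I have in
mind; struct vmm_vcpu, the thread loop, and the resume hook are made-up VMM
bookkeeping, only the KVM_RUN ioctl and the signal-interrupts-ioctl behavior
are standard:

#include <errno.h>
#include <pthread.h>
#include <signal.h>
#include <stddef.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

struct vmm_vcpu {
	int fd;			/* from KVM_CREATE_VCPU */
	pthread_t thread;	/* thread sitting in ioctl(fd, KVM_RUN, 0) */
};

/* Simplified vCPU thread loop: keep re-entering KVM_RUN after interruptions. */
static void *vmm_vcpu_thread(void *arg)
{
	struct vmm_vcpu *vcpu = arg;

	for (;;) {
		if (ioctl(vcpu->fd, KVM_RUN, 0) < 0 && errno != EINTR)
			break;	/* real error; EINTR just means we were kicked */
		/* handle kvm_run->exit_reason here */
	}
	return NULL;
}

/* Called from the VMM's resume hook, however the VMM learns about resume. */
static void vmm_kick_vcpus_after_resume(struct vmm_vcpu *vcpus, int nr_vcpus)
{
	int i;

	for (i = 0; i < nr_vcpus; i++)
		/*
		 * A signal the vCPU threads have a handler for (installed
		 * without SA_RESTART) makes a blocked KVM_RUN return -EINTR,
		 * so the loop above re-enters the guest right away.
		 */
		pthread_kill(vcpus[i].thread, SIGUSR1);
}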