Re: [PATCH 2/7] KVM: x86: Implement Hyper-V's vCPU suspended state

Vitaly Kuznetsov <vkuznets@xxxxxxxxxx> · Thu, 10 Oct 2024 10:57:43 +0200

Nikolas Wipper <nikwip@xxxxxxxxx> writes:

> Introduce a suspension state for Hyper-V enlightened vCPUs. Microsoft's
> "Hypervisor Top Level Functional Specification" (TLFS) introduces this
> state as a "vCPU that is stopped on a instruction guest boundary, either
> explicitly or implicitly due to an intercept". The only documented
> explicit suspension is caused in response to a TLB Flush hypercall, which
> will use the state switching API in subsequent patches.
>
> Each Hyper-V vCPU now has a 'suspended' boolean field, which is checked
> before entering the guest. When set, it forces the vCPU to block. Once in
> that state, the vCPU ignores any events. The vCPU is unsuspended by
> clearing 'suspend' and issuing a request to force the vCPU out of sleep.
>
> Suspensions are issued as a mechanism to halt execution until state change
> is observed on a remote vCPU. Hyper-V vCPUs track this with 'waiting_on',
> which holds the 'vcpu_id' of the remote vCPU that forced the vCPU to enter
> the suspended state. It's the remote vCPU's responsibility to wake up the
> suspended vCPUs when ready. 'waiting_on' ensures the remote vCPU can
> selectively unsuspend vCPUs that blocked on its behalf while leaving other
> suspended vCPUs undisturbed. One vCPU can only be suspended due to a
> single remote vCPU, but different vCPUs can be suspended on behalf of
> different or the same remote vCPU(s). The guest is responsible for
> avoiding circular dependencies between suspended vCPUs.
>
> Callers of the suspend API are responsible for ensuring that suspend and
> unsuspend aren't called in parallel while targeting the same pair of
> vCPUs. Otherwise kvm_hv_vcpu_{un}suspend_tlb_flush() ensure 'waiting_on'
> and 'suspended' are updated and accessed in the correct order. This, for
> example, avoids races where the unsuspended vCPU re-suspends before
> kvm_hv_vcpu_unsuspend_tlb_flush() is done updating 'waiting_on'.
>
> Signed-off-by: Nikolas Wipper <nikwip@xxxxxxxxx>
> ---
>  arch/x86/include/asm/kvm_host.h |  3 +++
>  arch/x86/kvm/hyperv.c           | 30 ++++++++++++++++++++++++++++++
>  arch/x86/kvm/hyperv.h           | 17 +++++++++++++++++
>  arch/x86/kvm/x86.c              |  4 +++-
>  4 files changed, 53 insertions(+), 1 deletion(-)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 46e0a466d7fb..7571ac578884 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -695,6 +695,9 @@ struct kvm_vcpu_hv {
>  		u64 vm_id;
>  		u32 vp_id;
>  	} nested;
> +
> +	bool suspended;
> +	int waiting_on;

I don't quite understand why we need 'suspended' at all. Isn't it always
suspended when 'waiting_on != -1'? I can see we always update these two
in pair.

Also, I would suggest we use a more descriptive
name. 'waiting_on_vcpu_id', for example.

>  };
>  
>  struct kvm_hypervisor_cpuid {
> diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
> index 4f0a94346d00..6e7941ed25ae 100644
> --- a/arch/x86/kvm/hyperv.c
> +++ b/arch/x86/kvm/hyperv.c
> @@ -971,6 +971,7 @@ int kvm_hv_vcpu_init(struct kvm_vcpu *vcpu)
>  
>  	vcpu->arch.hyperv = hv_vcpu;
>  	hv_vcpu->vcpu = vcpu;
> +	hv_vcpu->waiting_on = -1;
>  
>  	synic_init(&hv_vcpu->synic);
>  
> @@ -2915,3 +2916,32 @@ int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
>  
>  	return 0;
>  }
> +
> +void kvm_hv_vcpu_suspend_tlb_flush(struct kvm_vcpu *vcpu, int vcpu_id)

Can we make parameter's name 'waiting_on_vcpu_id' as well? Because as-is
I'm getting confused which CPU of these two is actually getting
suspended)

Also, why do we need '_tlb_flush' in the name? The mechanism seems to be
fairly generic, it's just that we use it for TLB flushes.

> +{
> +	/* waiting_on's store should happen before suspended's */
> +	WRITE_ONCE(vcpu->arch.hyperv->waiting_on, vcpu_id);
> +	WRITE_ONCE(vcpu->arch.hyperv->suspended, true);
> +}
> +
> +void kvm_hv_vcpu_unsuspend_tlb_flush(struct kvm_vcpu *vcpu)

And here someone may expect this means 'unsuspend vcpu' but in reality
this means 'unsuspend all vCPUs which are waiting on 'vcpu'). I guess we
need a rename. How about

void kvm_hv_unsuspend_vcpus(struct kvm_vcpu *waiting_on_vcpu)

?

> +{
> +	DECLARE_BITMAP(vcpu_mask, KVM_MAX_VCPUS);
> +	struct kvm_vcpu_hv *vcpu_hv;
> +	struct kvm_vcpu *v;
> +	unsigned long i;
> +
> +	kvm_for_each_vcpu(i, v, vcpu->kvm) {
> +		vcpu_hv = to_hv_vcpu(v);
> +
> +		if (kvm_hv_vcpu_suspended(v) &&
> +		    READ_ONCE(vcpu_hv->waiting_on) == vcpu->vcpu_id) {
> +			/* waiting_on's store should happen before suspended's */
> +			WRITE_ONCE(v->arch.hyperv->waiting_on, -1);
> +			WRITE_ONCE(v->arch.hyperv->suspended, false);
> +			__set_bit(i, vcpu_mask);
> +		}
> +	}
> +
> +	kvm_make_vcpus_request_mask(vcpu->kvm, KVM_REQ_EVENT, vcpu_mask);
> +}
> diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h
> index 913bfc96959c..a55832cea221 100644
> --- a/arch/x86/kvm/hyperv.h
> +++ b/arch/x86/kvm/hyperv.h
> @@ -265,6 +265,15 @@ static inline void kvm_hv_nested_transtion_tlb_flush(struct kvm_vcpu *vcpu,
>  }
>  
>  int kvm_hv_vcpu_flush_tlb(struct kvm_vcpu *vcpu);
> +
> +static inline bool kvm_hv_vcpu_suspended(struct kvm_vcpu *vcpu)
> +{
> +	return vcpu->arch.hyperv_enabled &&
> +	       READ_ONCE(vcpu->arch.hyperv->suspended);

I don't think READ_ONCE() means anything here, does it?

> +}
> +
> +void kvm_hv_vcpu_suspend_tlb_flush(struct kvm_vcpu *vcpu, int vcpu_id);
> +void kvm_hv_vcpu_unsuspend_tlb_flush(struct kvm_vcpu *vcpu);
>  #else /* CONFIG_KVM_HYPERV */
>  static inline void kvm_hv_setup_tsc_page(struct kvm *kvm,
>  					 struct pvclock_vcpu_time_info *hv_clock) {}
> @@ -321,6 +330,14 @@ static inline u32 kvm_hv_get_vpindex(struct kvm_vcpu *vcpu)
>  	return vcpu->vcpu_idx;
>  }
>  static inline void kvm_hv_nested_transtion_tlb_flush(struct kvm_vcpu *vcpu, bool tdp_enabled) {}
> +
> +static inline bool kvm_hv_vcpu_suspended(struct kvm_vcpu *vcpu)
> +{
> +	return false;
> +}
> +
> +static inline void kvm_hv_vcpu_suspend_tlb_flush(struct kvm_vcpu *vcpu, int vcpu_id) {}
> +static inline void kvm_hv_vcpu_unsuspend_tlb_flush(struct kvm_vcpu *vcpu) {}
>  #endif /* CONFIG_KVM_HYPERV */
>  
>  #endif /* __ARCH_X86_KVM_HYPERV_H__ */
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 15080385b8fe..18d0a300e79a 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -135,6 +135,8 @@ static void store_regs(struct kvm_vcpu *vcpu);
>  static int sync_regs(struct kvm_vcpu *vcpu);
>  static int kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu);
>  
> +static inline bool kvm_hv_vcpu_suspended(struct kvm_vcpu *vcpu);
> +
>  static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2);
>  static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2);
>  
> @@ -11107,7 +11109,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>  static bool kvm_vcpu_running(struct kvm_vcpu *vcpu)
>  {
>  	return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
> -		!vcpu->arch.apf.halted);
> +		!vcpu->arch.apf.halted && !kvm_hv_vcpu_suspended(vcpu));
>  }
>  
>  static bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)

-- 
Vitaly