On 10.10.24 10:57, Vitaly Kuznetsov wrote:
> Nikolas Wipper <nikwip@xxxxxxxxx> writes:
>> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
>> index 7571ac578884..ab3a9beb61a2 100644
>> --- a/arch/x86/include/asm/kvm_host.h
>> +++ b/arch/x86/include/asm/kvm_host.h
>> @@ -698,6 +698,8 @@ struct kvm_vcpu_hv {
>>
>>  	bool suspended;
>>  	int waiting_on;
>> +
>> +	int tlb_flush_inhibit;
>
> This is basically boolean, right? And we only make it 'int' to be able
> to store 'u8' from the ioctl? This doesn't look very clean. Do you
> envision anything but '1'/'0' in 'inhibit'? If not, maybe we can just
> make it a flag (and e.g. extend 'flags' to be u32/u64)? This way we can
> convert 'tlb_flush_inhibit' to a normal bool.
>

Yes, inhibit would always be binary, so incorporating it into the flags
sounds reasonable. Even with the current API, this could just be a bool
(tlb_flush_inhibit = inhibit == 1;)

>>  };
>>
>>  struct kvm_hypervisor_cpuid {
>> diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
>> index e68fbc0c7fc1..40ea8340838f 100644
>> --- a/arch/x86/kvm/hyperv.c
>> +++ b/arch/x86/kvm/hyperv.c
>> @@ -2137,6 +2137,9 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
>>  		bitmap_zero(vcpu_mask, KVM_MAX_VCPUS);
>>
>>  		kvm_for_each_vcpu(i, v, kvm) {
>> +			if (READ_ONCE(v->arch.hyperv->tlb_flush_inhibit))
>> +				goto ret_suspend;
>> +
>>  			__set_bit(i, vcpu_mask);
>>  		}
>>  	} else if (!is_guest_mode(vcpu)) {
>> @@ -2148,6 +2151,9 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
>>  				__clear_bit(i, vcpu_mask);
>>  				continue;
>>  			}
>> +
>> +			if (READ_ONCE(v->arch.hyperv->tlb_flush_inhibit))
>> +				goto ret_suspend;
>>  		}
>>  	} else {
>>  		struct kvm_vcpu_hv *hv_v;
>> @@ -2175,6 +2181,9 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
>>  						    sparse_banks))
>>  				continue;
>>
>> +			if (READ_ONCE(v->arch.hyperv->tlb_flush_inhibit))
>> +				goto ret_suspend;
>> +
>
> These READ_ONCEs make me think I misunderstand something here, please
> bear with me :-).
>
> Like we're trying to protect against 'tlb_flush_inhibit' being read
> somewhere in the beginning of the function and want to generate real
> memory accesses. But what happens if tlb_flush_inhibit changes right
> _after_ we checked it here and _before_ we actuall do
> kvm_make_vcpus_request_mask()? Wouldn't it be a problem? In case it
> would, I think we need to reverse the order: do
> kvm_make_vcpus_request_mask() anyway and after it go through vcpu_mask
> checking whether any of the affected vCPUs has 'tlb_flush_inhibit' and
> if it does, suspend the caller.
>

The case you're describing is prevented through SRCU synchronisation in
the ioctl. The hypercall actually holds a read-side critical section
during the whole of its execution, so when tlb_flush_inhibit changes
after we read it, the ioctl would wait for the flushes to complete:

vCPU 0                   | vCPU 1
-------------------------+------------------------
                         | hypercall enter
                         | srcu_read_lock()
ioctl enter              |
                         | tlb_flush_inhibit read
tlb_flush_inhibit write  |
synchronize_srcu() start |
                         | TLB flush reqs send
                         | srcu_read_unlock()
synchronize_srcu() end   |
ioctl exit               |
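To make the ordering a bit more concrete, here is a rough sketch of the
two sides (illustration only, not the patch code: the helper names are
made up, the types are simplified, and the single kvm_make_request()
stands in for kvm_hv_flush_tlb()'s real kvm_make_vcpus_request_mask()
path):

/* vCPU 1: hypercall emulation, already inside srcu_read_lock(&kvm->srcu) */
static u64 flush_tlb_sketch(struct kvm_vcpu *v)
{
	if (READ_ONCE(v->arch.hyperv->tlb_flush_inhibit))
		return -EBUSY;	/* suspend the calling vCPU and retry later */

	/* Flush requests go out while the read side is still held. */
	kvm_make_request(KVM_REQ_HV_TLB_FLUSH, v);
	return HV_STATUS_SUCCESS;
}

/* vCPU 0: ioctl side, cf. kvm_vcpu_ioctl_set_tlb_flush_inhibit() below */
static void set_inhibit_sketch(struct kvm_vcpu *vcpu, u8 inhibit)
{
	WRITE_ONCE(vcpu->arch.hyperv->tlb_flush_inhibit, inhibit);

	/*
	 * Wait for every SRCU read-side critical section that started
	 * before this point, i.e. for any hypercall that may have seen
	 * the old value. By the time this returns, such a hypercall has
	 * already sent its flush requests.
	 */
	synchronize_srcu(&vcpu->kvm->srcu);
}

So even if the write lands right after the hypercall's check, the ioctl
cannot return before the in-flight flushes have been issued.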
>>  			__set_bit(i, vcpu_mask);
>>  		}
>>  	}
>> @@ -2193,6 +2202,9 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
>>  	/* We always do full TLB flush, set 'Reps completed' = 'Rep Count' */
>>  	return (u64)HV_STATUS_SUCCESS |
>>  		((u64)hc->rep_cnt << HV_HYPERCALL_REP_COMP_OFFSET);
>> +ret_suspend:
>> +	kvm_hv_vcpu_suspend_tlb_flush(vcpu, v->vcpu_id);
>> +	return -EBUSY;
>>  }
>>
>>  static void kvm_hv_send_ipi_to_many(struct kvm *kvm, u32 vector,
>> @@ -2380,6 +2392,13 @@ static int kvm_hv_hypercall_complete(struct kvm_vcpu *vcpu, u64 result)
>>  	u32 tlb_lock_count = 0;
>>  	int ret;
>>
>> +	/*
>> +	 * Reached when the hyper-call resulted in a suspension of the vCPU.
>> +	 * The instruction will be re-tried once the vCPU is unsuspended.
>> +	 */
>> +	if (kvm_hv_vcpu_suspended(vcpu))
>> +		return 1;
>> +
>>  	if (hv_result_success(result) && is_guest_mode(vcpu) &&
>>  	    kvm_hv_is_tlb_flush_hcall(vcpu) &&
>>  	    kvm_read_guest(vcpu->kvm, to_hv_vcpu(vcpu)->nested.pa_page_gpa,
>> @@ -2919,6 +2938,9 @@ int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
>>
>>  void kvm_hv_vcpu_suspend_tlb_flush(struct kvm_vcpu *vcpu, int vcpu_id)
>>  {
>> +	RCU_LOCKDEP_WARN(!srcu_read_lock_held(&vcpu->kvm->srcu),
>> +			 "Suspicious Hyper-V TLB flush inhibit usage\n");
>> +
>>  	/* waiting_on's store should happen before suspended's */
>>  	WRITE_ONCE(vcpu->arch.hyperv->waiting_on, vcpu_id);
>>  	WRITE_ONCE(vcpu->arch.hyperv->suspended, true);
>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>> index 18d0a300e79a..1f925e32a927 100644
>> --- a/arch/x86/kvm/x86.c
>> +++ b/arch/x86/kvm/x86.c
>> @@ -4642,6 +4642,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
>>  	case KVM_CAP_HYPERV_CPUID:
>>  	case KVM_CAP_HYPERV_ENFORCE_CPUID:
>>  	case KVM_CAP_SYS_HYPERV_CPUID:
>> +	case KVM_CAP_HYPERV_TLB_FLUSH_INHIBIT:
>>  #endif
>>  	case KVM_CAP_PCI_SEGMENT:
>>  	case KVM_CAP_DEBUGREGS:
>> @@ -5853,6 +5854,31 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
>>  	}
>>  }
>>
>> +static int kvm_vcpu_ioctl_set_tlb_flush_inhibit(struct kvm_vcpu *vcpu,
>> +						 struct kvm_hyperv_tlb_flush_inhibit *set)
>> +{
>> +	if (set->inhibit == READ_ONCE(vcpu->arch.hyperv->tlb_flush_inhibit))
>> +		return 0;
>> +
>> +	WRITE_ONCE(vcpu->arch.hyperv->tlb_flush_inhibit, set->inhibit);
>
> As you say before, vCPU ioctls are serialized and noone else sets
> tlb_flush_inhibit, do I understand correctly that
> READ_ONCE()/WRITE_ONCE() are redundant here?
>

As mentioned before, since tlb_flush_inhibit is shared it needs these
calls.

Nikolas