On Mon, 2022-06-06 at 10:36 +0200, Vitaly Kuznetsov wrote:
> Handle L2 TLB flush requests by going through all vCPUs and checking
> whether there are vCPUs running the same VM_ID with a VP_ID specified
> in the requests. Perform synthetic exit to L2 upon finish.
>
> Note, while checking VM_ID/VP_ID of running vCPUs seem to be a bit
> racy, we count on the fact that KVM flushes the whole L2 VPID upon
> transition. Also, KVM_REQ_HV_TLB_FLUSH request needs to be done upon
> transition between L1 and L2 to make sure all pending requests are
> always processed.
>
> For the reference, Hyper-V TLFS refers to the feature as "Direct
> Virtual Flush".
>
> Note, nVMX/nSVM code does not handle VMCALL/VMMCALL from L2 yet.
>
> Signed-off-by: Vitaly Kuznetsov <vkuznets@xxxxxxxxxx>
> ---
>  arch/x86/kvm/hyperv.c | 84 +++++++++++++++++++++++++++++++++++--------
>  arch/x86/kvm/hyperv.h | 14 ++++----
>  arch/x86/kvm/trace.h  | 21 ++++++-----
>  arch/x86/kvm/x86.c    |  4 +--
>  4 files changed, 91 insertions(+), 32 deletions(-)
>
> diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
> index 3075e9661696..740190917c1c 100644
> --- a/arch/x86/kvm/hyperv.c
> +++ b/arch/x86/kvm/hyperv.c
> @@ -34,6 +34,7 @@
>  #include <linux/eventfd.h>
>  
>  #include <asm/apicdef.h>
> +#include <asm/mshyperv.h>
>  #include <trace/events/kvm.h>
>  
>  #include "trace.h"
> @@ -1835,9 +1836,10 @@ static int kvm_hv_get_tlb_flush_entries(struct kvm *kvm, struct kvm_hv_hcall *hc
>  				      entries, consumed_xmm_halves, offset);
>  }
>  
> -static void hv_tlb_flush_enqueue(struct kvm_vcpu *vcpu, u64 *entries, int count)
> +static void hv_tlb_flush_enqueue(struct kvm_vcpu *vcpu,
> +				 struct kvm_vcpu_hv_tlb_flush_fifo *tlb_flush_fifo,
> +				 u64 *entries, int count)
>  {
> -	struct kvm_vcpu_hv_tlb_flush_fifo *tlb_flush_fifo;
>  	struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
>  	u64 entry = KVM_HV_TLB_FLUSHALL_ENTRY;
>  	unsigned long flags;
> @@ -1845,9 +1847,6 @@ static void hv_tlb_flush_enqueue(struct kvm_vcpu *vcpu, u64 *entries, int count)
>  	if (!hv_vcpu)
>  		return;
>  
> -	/* kvm_hv_flush_tlb() is not ready to handle requests for L2s yet */
> -	tlb_flush_fifo = &hv_vcpu->tlb_flush_fifo[HV_L1_TLB_FLUSH_FIFO];
> -
>  	spin_lock_irqsave(&tlb_flush_fifo->write_lock, flags);
>  
>  	/*
> @@ -1883,7 +1882,7 @@ void kvm_hv_vcpu_flush_tlb(struct kvm_vcpu *vcpu)
>  		return;
>  	}
>  
> -	tlb_flush_fifo = kvm_hv_get_tlb_flush_fifo(vcpu);
> +	tlb_flush_fifo = kvm_hv_get_tlb_flush_fifo(vcpu, is_guest_mode(vcpu));
>  
>  	count = kfifo_out(&tlb_flush_fifo->entries, entries, KVM_HV_TLB_FLUSH_FIFO_SIZE);
>  
> @@ -1916,6 +1915,7 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
>  	struct hv_tlb_flush_ex flush_ex;
>  	struct hv_tlb_flush flush;
>  	DECLARE_BITMAP(vcpu_mask, KVM_MAX_VCPUS);
> +	struct kvm_vcpu_hv_tlb_flush_fifo *tlb_flush_fifo;
>  	/*
>  	 * Normally, there can be no more than 'KVM_HV_TLB_FLUSH_FIFO_SIZE'
>  	 * entries on the TLB flush fifo. The last entry, however, needs to be
> @@ -1959,7 +1959,8 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
>  	}
>  
>  	trace_kvm_hv_flush_tlb(flush.processor_mask,
> -			       flush.address_space, flush.flags);
> +			       flush.address_space, flush.flags,
> +			       is_guest_mode(vcpu));
>  
>  	valid_bank_mask = BIT_ULL(0);
>  	sparse_banks[0] = flush.processor_mask;
> @@ -1990,7 +1991,7 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
>  	trace_kvm_hv_flush_tlb_ex(flush_ex.hv_vp_set.valid_bank_mask,
>  				  flush_ex.hv_vp_set.format,
>  				  flush_ex.address_space,
> -				  flush_ex.flags);
> +				  flush_ex.flags, is_guest_mode(vcpu));
>  
>  	valid_bank_mask = flush_ex.hv_vp_set.valid_bank_mask;
>  	all_cpus = flush_ex.hv_vp_set.format !=
> @@ -2028,19 +2029,57 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
>  	 * vcpu->arch.cr3 may not be up-to-date for running vCPUs so we can't
>  	 * analyze it here, flush TLB regardless of the specified address space.
>  	 */
> -	if (all_cpus) {
> -		kvm_for_each_vcpu(i, v, kvm)
> -			hv_tlb_flush_enqueue(v, tlb_flush_entries, hc->rep_cnt);
> +	if (all_cpus && !is_guest_mode(vcpu)) {
> +		kvm_for_each_vcpu(i, v, kvm) {
> +			tlb_flush_fifo = kvm_hv_get_tlb_flush_fifo(v, false);
> +			hv_tlb_flush_enqueue(v, tlb_flush_fifo,
> +					     tlb_flush_entries, hc->rep_cnt);
> +		}
>  
>  		kvm_make_all_cpus_request(kvm, KVM_REQ_HV_TLB_FLUSH);
> -	} else {
> +	} else if (!is_guest_mode(vcpu)) {
>  		sparse_set_to_vcpu_mask(kvm, sparse_banks, valid_bank_mask, vcpu_mask);
>  
>  		for_each_set_bit(i, vcpu_mask, KVM_MAX_VCPUS) {
>  			v = kvm_get_vcpu(kvm, i);
>  			if (!v)
>  				continue;
> -			hv_tlb_flush_enqueue(v, tlb_flush_entries, hc->rep_cnt);
> +			tlb_flush_fifo = kvm_hv_get_tlb_flush_fifo(v, false);
> +			hv_tlb_flush_enqueue(v, tlb_flush_fifo,
> +					     tlb_flush_entries, hc->rep_cnt);
> +		}
> +
> +		kvm_make_vcpus_request_mask(kvm, KVM_REQ_HV_TLB_FLUSH, vcpu_mask);
> +	} else {
> +		struct kvm_vcpu_hv *hv_v;
> +
> +		bitmap_zero(vcpu_mask, KVM_MAX_VCPUS);
> +
> +		kvm_for_each_vcpu(i, v, kvm) {
> +			hv_v = to_hv_vcpu(v);
> +
> +			/*
> +			 * The following check races with nested vCPUs entering/exiting
> +			 * and/or migrating between L1's vCPUs, however the only case when
> +			 * KVM *must* flush the TLB is when the target L2 vCPU keeps
> +			 * running on the same L1 vCPU from the moment of the request until
> +			 * kvm_hv_flush_tlb() returns. TLB is fully flushed in all other
> +			 * cases, e.g. when the target L2 vCPU migrates to a different L1
> +			 * vCPU or when the corresponding L1 vCPU temporary switches to a
> +			 * different L2 vCPU while the request is being processed.
> +			 */

Looks great!
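To spell out (mostly for my own benefit) why the race is benign: as the
changelog says, KVM_REQ_HV_TLB_FLUSH is also raised on every transition
between L1 and L2, so a request that races with the target L2 vCPU
migrating away just degrades into a full flush of the new context instead
of getting lost. Roughly, and only as a sketch of my understanding rather
than the actual code from the earlier patches (the helper name below is
made up):

	/*
	 * Sketch only: on each nested entry/exit the pending Hyper-V TLB
	 * flush request is (re-)raised, so whatever was queued against the
	 * old context is serviced as a full flush before anything from the
	 * new context runs.
	 */
	static void hv_nested_transition_tlb_flush_sketch(struct kvm_vcpu *vcpu)
	{
		if (to_hv_vcpu(vcpu))
			kvm_make_request(KVM_REQ_HV_TLB_FLUSH, vcpu);
	}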
> +			if (!hv_v || hv_v->nested.vm_id != hv_vcpu->nested.vm_id)
> +				continue;
> +
> +			if (!all_cpus &&
> +			    !hv_is_vp_in_sparse_set(hv_v->nested.vp_id, valid_bank_mask,
> +						    sparse_banks))
> +				continue;
> +
> +			__set_bit(i, vcpu_mask);
> +			tlb_flush_fifo = kvm_hv_get_tlb_flush_fifo(v, true);
> +			hv_tlb_flush_enqueue(v, tlb_flush_fifo,
> +					     tlb_flush_entries, hc->rep_cnt);
>  		}
>  
>  		kvm_make_vcpus_request_mask(kvm, KVM_REQ_HV_TLB_FLUSH, vcpu_mask);
> @@ -2228,10 +2267,27 @@ static void kvm_hv_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result)
>  
>  static int kvm_hv_hypercall_complete(struct kvm_vcpu *vcpu, u64 result)
>  {
> +	int ret;
> +
>  	trace_kvm_hv_hypercall_done(result);
>  	kvm_hv_hypercall_set_result(vcpu, result);
>  	++vcpu->stat.hypercalls;
> -	return kvm_skip_emulated_instruction(vcpu);
> +	ret = kvm_skip_emulated_instruction(vcpu);
> +
> +	if (unlikely(hv_result_success(result) && is_guest_mode(vcpu)
> +	    && kvm_hv_is_tlb_flush_hcall(vcpu))) {
> +		struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
> +		u32 tlb_lock_count;
> +
> +		if (unlikely(kvm_read_guest(vcpu->kvm, hv_vcpu->nested.pa_page_gpa,
> +					    &tlb_lock_count, sizeof(tlb_lock_count))))
> +			kvm_inject_gp(vcpu, 0);
> +
> +		if (tlb_lock_count)
> +			kvm_x86_ops.nested_ops->hv_inject_synthetic_vmexit_post_tlb_flush(vcpu);
> +	}
> +
> +	return ret;
>  }
>  
>  static int kvm_hv_hypercall_complete_userspace(struct kvm_vcpu *vcpu)
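A note for anyone reading along: 'tlb_lock_count' is, if I read the TLFS
correctly, the TlbLockCount field at the start of the partition assist
page provided by L1; a non-zero value after a flush done on behalf of L2
means the L0 hypervisor has to synthesize a VM exit so L1 regains control
and can release the lock, which is what the hunk above does. For
reference, my understanding of the relevant layout (paraphrased from
memory, please double-check against asm/hyperv-tlfs.h):

	/* Only the field the TLB flush completion path cares about. */
	struct hv_partition_assist_pg {
		u32 tlb_lock_count;
	};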
> diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h
> index dc46c5ed5d18..7778b3a5913c 100644
> --- a/arch/x86/kvm/hyperv.h
> +++ b/arch/x86/kvm/hyperv.h
> @@ -148,26 +148,24 @@ int kvm_vm_ioctl_hv_eventfd(struct kvm *kvm, struct kvm_hyperv_eventfd *args);
>  int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
>  		     struct kvm_cpuid_entry2 __user *entries);
>  
> -static inline struct kvm_vcpu_hv_tlb_flush_fifo *kvm_hv_get_tlb_flush_fifo(struct kvm_vcpu *vcpu)
> +static inline struct kvm_vcpu_hv_tlb_flush_fifo *kvm_hv_get_tlb_flush_fifo(struct kvm_vcpu *vcpu,
> +									    bool is_guest_mode)
>  {
>  	struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
> -	int i = !is_guest_mode(vcpu) ? HV_L1_TLB_FLUSH_FIFO :
> -				       HV_L2_TLB_FLUSH_FIFO;
> -
> -	/* KVM does not handle L2 TLB flush requests yet */
> -	WARN_ON_ONCE(i != HV_L1_TLB_FLUSH_FIFO);
> +	int i = is_guest_mode ? HV_L2_TLB_FLUSH_FIFO :
> +				HV_L1_TLB_FLUSH_FIFO;
>  
>  	return &hv_vcpu->tlb_flush_fifo[i];
>  }
>  
> -static inline void kvm_hv_vcpu_empty_flush_tlb(struct kvm_vcpu *vcpu)
> +static inline void kvm_hv_vcpu_empty_flush_tlb(struct kvm_vcpu *vcpu, bool is_guest_mode)
>  {
>  	struct kvm_vcpu_hv_tlb_flush_fifo *tlb_flush_fifo;
>  
>  	if (!to_hv_vcpu(vcpu) || !kvm_check_request(KVM_REQ_HV_TLB_FLUSH, vcpu))
>  		return;
>  
> -	tlb_flush_fifo = kvm_hv_get_tlb_flush_fifo(vcpu);
> +	tlb_flush_fifo = kvm_hv_get_tlb_flush_fifo(vcpu, is_guest_mode);
>  
>  	kfifo_reset_out(&tlb_flush_fifo->entries);
>  }
> diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
> index fd28dd40b813..f5e5b8f0342c 100644
> --- a/arch/x86/kvm/trace.h
> +++ b/arch/x86/kvm/trace.h
> @@ -1510,38 +1510,41 @@ TRACE_EVENT(kvm_hv_timer_state,
>   * Tracepoint for kvm_hv_flush_tlb.
>   */
>  TRACE_EVENT(kvm_hv_flush_tlb,
> -	TP_PROTO(u64 processor_mask, u64 address_space, u64 flags),
> -	TP_ARGS(processor_mask, address_space, flags),
> +	TP_PROTO(u64 processor_mask, u64 address_space, u64 flags, bool guest_mode),
> +	TP_ARGS(processor_mask, address_space, flags, guest_mode),
>  
>  	TP_STRUCT__entry(
>  		__field(u64, processor_mask)
>  		__field(u64, address_space)
>  		__field(u64, flags)
> +		__field(bool, guest_mode)
>  	),
>  
>  	TP_fast_assign(
>  		__entry->processor_mask = processor_mask;
>  		__entry->address_space = address_space;
>  		__entry->flags = flags;
> +		__entry->guest_mode = guest_mode;
>  	),
>  
> -	TP_printk("processor_mask 0x%llx address_space 0x%llx flags 0x%llx",
> +	TP_printk("processor_mask 0x%llx address_space 0x%llx flags 0x%llx %s",
>  		  __entry->processor_mask, __entry->address_space,
> -		  __entry->flags)
> +		  __entry->flags, __entry->guest_mode ? "(L2)" : "")
>  );
>  
>  /*
>   * Tracepoint for kvm_hv_flush_tlb_ex.
>   */
>  TRACE_EVENT(kvm_hv_flush_tlb_ex,
> -	TP_PROTO(u64 valid_bank_mask, u64 format, u64 address_space, u64 flags),
> -	TP_ARGS(valid_bank_mask, format, address_space, flags),
> +	TP_PROTO(u64 valid_bank_mask, u64 format, u64 address_space, u64 flags, bool guest_mode),
> +	TP_ARGS(valid_bank_mask, format, address_space, flags, guest_mode),
>  
>  	TP_STRUCT__entry(
>  		__field(u64, valid_bank_mask)
>  		__field(u64, format)
>  		__field(u64, address_space)
>  		__field(u64, flags)
> +		__field(bool, guest_mode)
>  	),
>  
>  	TP_fast_assign(
> @@ -1549,12 +1552,14 @@ TRACE_EVENT(kvm_hv_flush_tlb_ex,
>  		__entry->format = format;
>  		__entry->address_space = address_space;
>  		__entry->flags = flags;
> +		__entry->guest_mode = guest_mode;
>  	),
>  
>  	TP_printk("valid_bank_mask 0x%llx format 0x%llx "
> -		  "address_space 0x%llx flags 0x%llx",
> +		  "address_space 0x%llx flags 0x%llx %s",
>  		  __entry->valid_bank_mask, __entry->format,
> -		  __entry->address_space, __entry->flags)
> +		  __entry->address_space, __entry->flags,
> +		  __entry->guest_mode ? "(L2)" : "")
>  );
>  
>  /*
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 805db43c2829..8e945500ef50 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -3355,12 +3355,12 @@ void kvm_service_local_tlb_flush_requests(struct kvm_vcpu *vcpu)
>  {
>  	if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu)) {
>  		kvm_vcpu_flush_tlb_current(vcpu);
> -		kvm_hv_vcpu_empty_flush_tlb(vcpu);
> +		kvm_hv_vcpu_empty_flush_tlb(vcpu, is_guest_mode(vcpu));
>  	}
>  
>  	if (kvm_check_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu)) {
>  		kvm_vcpu_flush_tlb_guest(vcpu);
> -		kvm_hv_vcpu_empty_flush_tlb(vcpu);
> +		kvm_hv_vcpu_empty_flush_tlb(vcpu, is_guest_mode(vcpu));
>  	} else if (kvm_check_request(KVM_REQ_HV_TLB_FLUSH, vcpu)) {
>  		kvm_hv_vcpu_flush_tlb(vcpu);
>  	}

Looks good,

Reviewed-by: Maxim Levitsky <mlevitsk@xxxxxxxxxx>

Best regards,
Maxim Levitsky