On Wed, Sep 26, 2018 at 07:02:58PM +0200, Vitaly Kuznetsov wrote:
> VP index almost always matches VCPU and when it does it's faster to walk
> the sparse set instead of all vcpus.
>
> Signed-off-by: Vitaly Kuznetsov <vkuznets@xxxxxxxxxx>
> ---
>  arch/x86/kvm/hyperv.c | 96 +++++++++++++++++++++++--------------------
>  1 file changed, 52 insertions(+), 44 deletions(-)
>
> diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
> index eeb12eacd525..cc0535a078f7 100644
> --- a/arch/x86/kvm/hyperv.c
> +++ b/arch/x86/kvm/hyperv.c
> @@ -1277,32 +1277,37 @@ int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
>  	return kvm_hv_get_msr(vcpu, msr, pdata, host);
>  }
>
> -static __always_inline int get_sparse_bank_no(u64 valid_bank_mask, int bank_no)
> +static __always_inline bool hv_vcpu_in_sparse_set(struct kvm_vcpu_hv *hv_vcpu,
> +						  u64 sparse_banks[],
> +						  u64 valid_bank_mask)
>  {
> -	int i = 0, j;
> +	int bank = hv_vcpu->vp_index / 64, sbank;
>
> -	if (!(valid_bank_mask & BIT_ULL(bank_no)))
> -		return -1;
> +	if (bank >= 64)
> +		return false;
>
> -	for (j = 0; j < bank_no; j++)
> -		if (valid_bank_mask & BIT_ULL(j))
> -			i++;
> +	if (!(valid_bank_mask & BIT_ULL(bank)))
> +		return false;
>
> -	return i;
> +	/* Sparse bank number equals to the number of set bits before it */
> +	sbank = bitmap_weight((unsigned long *)&valid_bank_mask, bank);
> +
> +	return !!(sparse_banks[sbank] & BIT_ULL(hv_vcpu->vp_index % 64));
>  }
>
>  static u64 kvm_hv_flush_tlb(struct kvm_vcpu *current_vcpu, u64 ingpa,
>  			    u16 rep_cnt, bool ex)
>  {
>  	struct kvm *kvm = current_vcpu->kvm;
> -	struct kvm_vcpu_hv *hv_current = &current_vcpu->arch.hyperv;
> +	struct kvm_hv *hv = &kvm->arch.hyperv;
> +	struct kvm_vcpu_hv *hv_vcpu = &current_vcpu->arch.hyperv;
>  	struct hv_tlb_flush_ex flush_ex;
>  	struct hv_tlb_flush flush;
>  	struct kvm_vcpu *vcpu;
>  	unsigned long vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)] = {0};
> -	u64 valid_bank_mask = 0;
> +	u64 valid_bank_mask;
>  	u64 sparse_banks[64];
> -	int sparse_banks_len, i;
> +	int sparse_banks_len, i, bank, sbank;
>  	bool all_cpus;
>
>  	if (!ex) {
> @@ -1312,6 +1317,7 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *current_vcpu, u64 ingpa,
>  		trace_kvm_hv_flush_tlb(flush.processor_mask,
>  				       flush.address_space, flush.flags);
>
> +		valid_bank_mask = BIT_ULL(0);
>  		sparse_banks[0] = flush.processor_mask;
>  		all_cpus = flush.flags & HV_FLUSH_ALL_PROCESSORS;
>  	} else {
> @@ -1344,52 +1350,54 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *current_vcpu, u64 ingpa,
>  			return HV_STATUS_INVALID_HYPERCALL_INPUT;
>  	}
>
> -	cpumask_clear(&hv_current->tlb_lush);
> +	/*
> +	 * vcpu->arch.cr3 may not be up-to-date for running vCPUs so we can't
> +	 * analyze it here, flush TLB regardless of the specified address space.
> +	 */
> +	cpumask_clear(&hv_vcpu->tlb_lush);

Maybe squash this hv_current -> hv_vcpu renaming into patch 3?

(And yes, this "lush" is funny, too ;)

>
>  	if (all_cpus) {
>  		kvm_make_vcpus_request_mask(kvm,
>  				    KVM_REQ_TLB_FLUSH | KVM_REQUEST_NO_WAKEUP,
> -				    NULL, &hv_current->tlb_lush);
> +				    NULL, &hv_vcpu->tlb_lush);
>  		goto ret_success;
>  	}
>
> -	kvm_for_each_vcpu(i, vcpu, kvm) {
> -		struct kvm_vcpu_hv *hv = &vcpu->arch.hyperv;
> -		int bank = hv->vp_index / 64, sbank = 0;
> -
> -		/* Banks >64 can't be represented */
> -		if (bank >= 64)
> -			continue;
> -
> -		/* Non-ex hypercalls can only address first 64 vCPUs */
> -		if (!ex && bank)
> -			continue;
> -
> -		if (ex) {
> -			/*
> -			 * Check is the bank of this vCPU is in sparse
> -			 * set and get the sparse bank number.
> -			 */
> -			sbank = get_sparse_bank_no(valid_bank_mask, bank);
> -
> -			if (sbank < 0)
> -				continue;
> +	if (atomic_read(&hv->num_mismatched_vp_indexes)) {
> +		kvm_for_each_vcpu(i, vcpu, kvm) {
> +			if (hv_vcpu_in_sparse_set(&vcpu->arch.hyperv,
> +						  sparse_banks,
> +						  valid_bank_mask))
> +				__set_bit(i, vcpu_bitmap);
>  		}
> +		goto flush_request;
> +	}
>
> -		if (!(sparse_banks[sbank] & BIT_ULL(hv->vp_index % 64)))
> -			continue;
> -
> -		/*
> -		 * vcpu->arch.cr3 may not be up-to-date for running vCPUs so we
> -		 * can't analyze it here, flush TLB regardless of the specified
> -		 * address space.
> -		 */
> -		__set_bit(i, vcpu_bitmap);
> +	/*
> +	 * num_mismatched_vp_indexes is zero so every vcpu has
> +	 * vp_index == vcpu_idx.
> +	 */
> +	sbank = 0;
> +	for_each_set_bit(bank, (unsigned long *)&valid_bank_mask,
> +			 BITS_PER_LONG) {

s/BITS_PER_LONG/64/

> +		for_each_set_bit(i,
> +				 (unsigned long *)&sparse_banks[sbank],
> +				 BITS_PER_LONG) {

ditto

> +			u32 vp_index = bank * 64 + i;
> +
> +			/* A non-existent vCPU was specified */
> +			if (vp_index >= KVM_MAX_VCPUS)
> +				return HV_STATUS_INVALID_HYPERCALL_INPUT;
> +
> +			__set_bit(vp_index, vcpu_bitmap);
> +		}
> +		sbank++;
>  	}

I wonder if copying the bank as a whole would make it easier to follow
(and somewhat more efficient):

	sbank = 0;
	for_each_set_bit(bank, (unsigned long *)&valid_bank_mask, 64)
		((u64 *)vcpu_bitmap)[bank] = sparse_banks[sbank++];

Also it seems equally efficient but slightly easier to read if
vcpu_bitmap is filled first regardless of ->num_mismatched_vp_indexes,
and is then either passed directly to kvm_make_vcpus_request_mask when
num_mismatched_vp_indexes == 0, or converted into the real vcpu mask
using the regular test_bit otherwise.  So eventually it would all look
like:

	...
	u64 vp_bitmap[KVM_MAX_VCPUS / 64] = {0};
	DECLARE_BITMAP(vcpu_bitmap, KVM_MAX_VCPUS) = {0};
	unsigned long *vcpu_mask;
	...
	if (all_cpus) {
		vcpu_mask = NULL;
		goto flush_request;
	}

	sbank = 0;
	for_each_set_bit(bank, (unsigned long *)&valid_bank_mask, 64)
		vp_bitmap[bank] = sparse_banks[sbank++];

	if (likely(!atomic_read(&hv->num_mismatched_vp_indexes))) {
		/* for all vcpus vp_index == vcpu_idx */
		vcpu_mask = (unsigned long *)vp_bitmap;
		goto flush_request;
	}

	kvm_for_each_vcpu(i, vcpu, kvm)
		if (test_bit(vcpu_to_hv_vcpu(vcpu)->vp_index,
			     (unsigned long *)vp_bitmap))
			__set_bit(i, vcpu_bitmap);
	vcpu_mask = vcpu_bitmap;
	...

>
> +flush_request:
>  	kvm_make_vcpus_request_mask(kvm,
>  				    KVM_REQ_TLB_FLUSH | KVM_REQUEST_NO_WAKEUP,
> -				    vcpu_bitmap, &hv_current->tlb_lush);
> +				    vcpu_bitmap, &hv_vcpu->tlb_lush);
>
>  ret_success:
>  	/* We always do full TLB flush, set rep_done = rep_cnt. */

Roman.
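For reference, the sparse VP set encoding under discussion is easy to
model outside the kernel.  The snippet below is only an illustrative
userspace sketch, not KVM code: vp_in_sparse_set() mirrors the logic of
hv_vcpu_in_sparse_set() from the patch, with __builtin_popcountll
standing in for the kernel's bitmap_weight().

	#include <stdbool.h>
	#include <stdint.h>
	#include <stdio.h>

	/*
	 * Model of the sparse VP set used by the "ex" hypercalls:
	 * valid_bank_mask has one bit per 64-VP bank; sparse_banks[]
	 * holds only the banks whose bit is set, in ascending order.
	 */
	static bool vp_in_sparse_set(uint32_t vp_index, uint64_t valid_bank_mask,
				     const uint64_t sparse_banks[])
	{
		uint32_t bank = vp_index / 64;
		int sbank;

		if (bank >= 64 || !(valid_bank_mask & (1ULL << bank)))
			return false;

		/* sparse bank number == number of set bits below this bank */
		sbank = __builtin_popcountll(valid_bank_mask &
					     ((1ULL << bank) - 1));

		return sparse_banks[sbank] & (1ULL << (vp_index % 64));
	}

	int main(void)
	{
		/* banks 0 and 2 present; bank 2 lands at sparse index 1 */
		uint64_t valid_bank_mask = (1ULL << 0) | (1ULL << 2);
		uint64_t sparse_banks[] = { 1ULL << 5, 1ULL << 7 };

		/* VP 5: bank 0, bit 5 -> 1 */
		printf("%d\n", vp_in_sparse_set(5, valid_bank_mask, sparse_banks));
		/* VP 135: bank 2, bit 7 -> 1 */
		printf("%d\n", vp_in_sparse_set(135, valid_bank_mask, sparse_banks));
		/* VP 70: bank 1 is not in the set -> 0 */
		printf("%d\n", vp_in_sparse_set(70, valid_bank_mask, sparse_banks));
		return 0;
	}

Compiled with a plain cc, this prints 1, 1, 0: VP 135 resolves to bank 2,
which is the second set bit in valid_bank_mask and hence sparse_banks[1].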