Introduce a new bit, KVM_X86_DISABLE_EXITS_PER_VCPU, and use the second argument of the KVM_CAP_X86_DISABLE_EXITS capability as a vCPU mask, enabling finer-grained disabling of VM exits on a per-vCPU basis instead of for the whole guest. exits_disable_vcpu_mask defaults to 0, which disables exits on all vCPUs; if it is 0x5, for example, exits stay enabled on vCPU0 and vCPU2 and are disabled on all other vCPUs. This patch only enables per-vCPU disabling for HLT VM-exits. In use cases like a Windows guest running heavy CPU-bound workloads, disabling HLT VM-exits can mitigate host scheduler context-switch overhead. Simply disabling HLT exits on all vCPUs can bring performance benefits, but if no pCPUs are reserved for host threads, it can lead to forced preemption, because the host does not know when to schedule other host threads that want to run. With this patch, we can disable HLT exits on only a subset of a guest's vCPUs; this keeps the performance benefits and also shows resiliency to a host stress workload running at the same time. In an experiment with a host stress workload and a Windows guest running heavy CPU-bound workloads, it shows good resiliency and a ~3% performance improvement. Signed-off-by: Kechen Lu <kechenl@xxxxxxxxxx> --- Documentation/virt/kvm/api.rst | 8 +++++++- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/cpuid.c | 2 +- arch/x86/kvm/svm/svm.c | 2 +- arch/x86/kvm/vmx/vmx.c | 4 ++-- arch/x86/kvm/x86.c | 5 ++++- arch/x86/kvm/x86.h | 5 +++-- include/uapi/linux/kvm.h | 4 +++- 8 files changed, 22 insertions(+), 9 deletions(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index aeeb071c7688..9a44896dc950 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -6580,6 +6580,9 @@ branch to guests' 0x200 interrupt vector. 
:Architectures: x86 :Parameters: args[0] defines which exits are disabled + args[1] defines vCPU bitmask based on vCPU ID, 1 on + corresponding vCPU ID bit would enable exits + on that vCPU :Returns: 0 on success, -EINVAL when args[0] contains invalid exits Valid bits in args[0] are:: @@ -6588,13 +6591,16 @@ Valid bits in args[0] are:: #define KVM_X86_DISABLE_EXITS_HLT (1 << 1) #define KVM_X86_DISABLE_EXITS_PAUSE (1 << 2) #define KVM_X86_DISABLE_EXITS_CSTATE (1 << 3) + #define KVM_X86_DISABLE_EXITS_PER_VCPU (1UL << 63) Enabling this capability on a VM provides userspace with a way to no longer intercept some instructions for improved latency in some workloads, and is suggested when vCPUs are associated to dedicated physical CPUs. More bits can be added in the future; userspace can just pass the KVM_CHECK_EXTENSION result to KVM_ENABLE_CAP to disable -all such vmexits. +all such vmexits. Setting KVM_X86_DISABLE_EXITS_PER_VCPU enables per-vCPU +exits disabling based on the vCPU bitmask in args[1]; currently this +only applies to HLT exits. Do not enable KVM_FEATURE_PV_UNHALT if you disable HLT exits. 
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 2164b9f4c7b0..1c65dc500c55 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1085,6 +1085,7 @@ struct kvm_arch { bool hlt_in_guest; bool pause_in_guest; bool cstate_in_guest; + u64 exits_disable_vcpu_mask; unsigned long irq_sources_bitmap; s64 kvmclock_offset; diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 07e9215e911d..6291e15710ba 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -177,7 +177,7 @@ void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu) best->ebx = xstate_required_size(vcpu->arch.xcr0, true); best = kvm_find_kvm_cpuid_features(vcpu); - if (kvm_hlt_in_guest(vcpu->kvm) && best && + if (kvm_hlt_in_guest(vcpu) && best && (best->eax & (1 << KVM_FEATURE_PV_UNHALT))) best->eax &= ~(1 << KVM_FEATURE_PV_UNHALT); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index d0f68d11ec70..d24f67b33ae5 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1276,7 +1276,7 @@ static void init_vmcb(struct kvm_vcpu *vcpu) svm_set_intercept(svm, INTERCEPT_MWAIT); } - if (!kvm_hlt_in_guest(vcpu->kvm)) + if (!kvm_hlt_in_guest(vcpu)) svm_set_intercept(svm, INTERCEPT_HLT); control->iopm_base_pa = __sme_set(iopm_base); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 5aadad3e7367..8694279bb655 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1585,7 +1585,7 @@ static void vmx_clear_hlt(struct kvm_vcpu *vcpu) * then the instruction is already executing and RIP has already been * advanced. 
*/ - if (kvm_hlt_in_guest(vcpu->kvm) && + if (kvm_hlt_in_guest(vcpu) && vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT) vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); } @@ -4123,7 +4123,7 @@ static u32 vmx_exec_control(struct vcpu_vmx *vmx) if (kvm_mwait_in_guest(vmx->vcpu.kvm)) exec_control &= ~(CPU_BASED_MWAIT_EXITING | CPU_BASED_MONITOR_EXITING); - if (kvm_hlt_in_guest(vmx->vcpu.kvm)) + if (kvm_hlt_in_guest(&vmx->vcpu)) exec_control &= ~CPU_BASED_HLT_EXITING; return exec_control; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0cf1082455df..9432d7c04a98 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5773,6 +5773,9 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, kvm->arch.pause_in_guest = true; if (cap->args[0] & KVM_X86_DISABLE_EXITS_CSTATE) kvm->arch.cstate_in_guest = true; + if ((cap->args[0] & KVM_X86_DISABLE_EXITS_PER_VCPU) && + cap->args[1]) + kvm->arch.exits_disable_vcpu_mask = cap->args[1]; r = 0; break; case KVM_CAP_MSR_PLATFORM_INFO: @@ -12080,7 +12083,7 @@ bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu) vcpu->arch.exception.pending)) return false; - if (kvm_hlt_in_guest(vcpu->kvm) && !kvm_can_deliver_async_pf(vcpu)) + if (kvm_hlt_in_guest(vcpu) && !kvm_can_deliver_async_pf(vcpu)) return false; /* diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 4abcd8d9836d..449476e13206 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -377,9 +377,10 @@ static inline bool kvm_mwait_in_guest(struct kvm *kvm) return kvm->arch.mwait_in_guest; } -static inline bool kvm_hlt_in_guest(struct kvm *kvm) +static inline bool kvm_hlt_in_guest(struct kvm_vcpu *vcpu) { - return kvm->arch.hlt_in_guest; + return vcpu->kvm->arch.hlt_in_guest && (rol64(1UL, vcpu->vcpu_id) & + ~vcpu->kvm->arch.exits_disable_vcpu_mask); } static inline bool kvm_pause_in_guest(struct kvm *kvm) diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 1daa45268de2..976eb16f7fc0 100644 --- a/include/uapi/linux/kvm.h +++ 
b/include/uapi/linux/kvm.h @@ -798,10 +798,12 @@ struct kvm_ioeventfd { #define KVM_X86_DISABLE_EXITS_HLT (1 << 1) #define KVM_X86_DISABLE_EXITS_PAUSE (1 << 2) #define KVM_X86_DISABLE_EXITS_CSTATE (1 << 3) +#define KVM_X86_DISABLE_EXITS_PER_VCPU (1UL << 63) #define KVM_X86_DISABLE_VALID_EXITS (KVM_X86_DISABLE_EXITS_MWAIT | \ KVM_X86_DISABLE_EXITS_HLT | \ KVM_X86_DISABLE_EXITS_PAUSE | \ - KVM_X86_DISABLE_EXITS_CSTATE) + KVM_X86_DISABLE_EXITS_CSTATE| \ + KVM_X86_DISABLE_EXITS_PER_VCPU) /* for KVM_ENABLE_CAP */ struct kvm_enable_cap { -- 2.30.2