The purpose of the KVM_SET_SIGNAL_MASK API is to let userspace "kick" a VCPU out of KVM_RUN through a POSIX signal. A signal is attached to a dummy signal handler; by blocking the signal outside KVM_RUN and unblocking it inside, this possible race is closed: VCPU thread service thread -------------------------------------------------------------- check flag set flag raise signal (signal handler does nothing) KVM_RUN However, one issue with KVM_SET_SIGNAL_MASK is that it has to take tsk->sighand->siglock on every KVM_RUN. This lock is often on a remote NUMA node, because it is on the node of a thread's creator. Taking this lock can be very expensive if there are many userspace exits (as is the case for SMP Windows VMs without Hyper-V reference time counter). As an alternative, we can put the flag directly in kvm_run so that KVM can see it: VCPU thread service thread -------------------------------------------------------------- raise signal signal handler set run->immediate_exit KVM_RUN check run->immediate_exit Signed-off-by: Paolo Bonzini <pbonzini@xxxxxxxxxx> --- Documentation/virtual/kvm/api.txt | 13 ++++++++++++- include/uapi/linux/kvm.h | 1 + virt/kvm/kvm_main.c | 14 +++++++++++--- 3 files changed, 24 insertions(+), 4 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index e4f2cdcf78eb..925b1b6be073 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -3389,7 +3389,18 @@ struct kvm_run { Request that KVM_RUN return when it becomes possible to inject external interrupts into the guest. Useful in conjunction with KVM_INTERRUPT. - __u8 padding1[7]; + __u8 immediate_exit; + +This field is polled once when KVM_RUN starts; if non-zero, KVM_RUN +clears the field and exits immediately, returning -EINTR. In the common +scenario where a signal is used to "kick" a VCPU out of KVM_RUN, this +field can be used as an alternative to KVM_SET_SIGNAL_MASK: instead of +blocking the signal outside KVM_RUN, the signal handler should set +run->immediate_exit to a nonzero value. + +This field is only available if KVM_CAP_IMMEDIATE_EXIT is. + + __u8 padding1[6]; /* out */ __u32 exit_reason; diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 7964b970b9ad..0aa43b1e5b1d 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -881,6 +881,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_SPAPR_RESIZE_HPT 133 #define KVM_CAP_PPC_MMU_RADIX 134 #define KVM_CAP_PPC_MMU_HASH_V3 135 +#define KVM_CAP_IMMEDIATE_EXIT 136 #ifdef KVM_CAP_IRQ_ROUTING diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 482612b4e496..5abb8df00db1 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2550,7 +2550,8 @@ static long kvm_vcpu_ioctl(struct file *filp, if (r) return r; switch (ioctl) { - case KVM_RUN: + case KVM_RUN: { + struct kvm_run *run; r = -EINVAL; if (arg) goto out; @@ -2564,9 +2565,15 @@ static long kvm_vcpu_ioctl(struct file *filp, synchronize_rcu(); put_pid(oldpid); } - r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run); - trace_kvm_userspace_exit(vcpu->run->exit_reason, r); + run = vcpu->run; + if (run->immediate_exit) { + WRITE_ONCE(run->immediate_exit, 0); + return -EINTR; + } + r = kvm_arch_vcpu_ioctl_run(vcpu, run); + trace_kvm_userspace_exit(run->exit_reason, r); break; + } case KVM_GET_REGS: { struct kvm_regs *kvm_regs; @@ -2927,6 +2934,7 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) #endif case KVM_CAP_IOEVENTFD_ANY_LENGTH: case KVM_CAP_CHECK_EXTENSION_VM: + case KVM_CAP_IMMEDIATE_EXIT: return 1; #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING case KVM_CAP_IRQ_ROUTING: -- 1.8.3.1