On Tue, Oct 26, 2021, David Woodhouse wrote: > From: David Woodhouse <dwmw@xxxxxxxxxxxx> > > The Xen interrupt injection for event channels relies on accessing the > guest's vcpu_info structure in __kvm_xen_has_interrupt(), through a > gfn_to_hva_cache. > > This requires the srcu lock to be held, which is mostly the case except > for this code path: > > [ 11.822877] WARNING: suspicious RCU usage > [ 11.822965] ----------------------------- > [ 11.823013] include/linux/kvm_host.h:664 suspicious rcu_dereference_check() usage! > [ 11.823131] > [ 11.823131] other info that might help us debug this: > [ 11.823131] > [ 11.823196] > [ 11.823196] rcu_scheduler_active = 2, debug_locks = 1 > [ 11.823253] 1 lock held by dom:0/90: > [ 11.823292] #0: ffff998956ec8118 (&vcpu->mutex){+.+.}, at: kvm_vcpu_ioctl+0x85/0x680 > [ 11.823379] > [ 11.823379] stack backtrace: > [ 11.823428] CPU: 2 PID: 90 Comm: dom:0 Kdump: loaded Not tainted 5.4.34+ #5 > [ 11.823496] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.12.1-0-ga5cab58e9a3f-prebuilt.qemu.org 04/01/2014 > [ 11.823612] Call Trace: > [ 11.823645] dump_stack+0x7a/0xa5 > [ 11.823681] lockdep_rcu_suspicious+0xc5/0x100 > [ 11.823726] __kvm_xen_has_interrupt+0x179/0x190 > [ 11.823773] kvm_cpu_has_extint+0x6d/0x90 > [ 11.823813] kvm_cpu_accept_dm_intr+0xd/0x40 > [ 11.823853] kvm_vcpu_ready_for_interrupt_injection+0x20/0x30 > < post_kvm_run_save() inlined here > > [ 11.823906] kvm_arch_vcpu_ioctl_run+0x135/0x6a0 > [ 11.823947] kvm_vcpu_ioctl+0x263/0x680 > > Fixes: 40da8ccd724f ("KVM: x86/xen: Add event channel interrupt vector upcall") > Signed-off-by: David Woodhouse <dwmw@xxxxxxxxxxxx> > --- > > There are potentially other ways of doing this, by shuffling the tail > of kvm_arch_vcpu_ioctl_run() around a little and holding the lock once > there instead of taking it within vcpu_run(). But the call to > post_kvm_run_save() occurs even on the error paths, and it gets complex > to untangle. This is the simple approach. 
What about taking the lock much earlier, so that the tail doesn't need to juggle errors? Dropping the lock for the KVM_MP_STATE_UNINITIALIZED case is a little unfortunate, but that at least pairs with similar logic in x86's other call to kvm_vcpu_block(). Relocking if xfer_to_guest_mode_handle_work() triggers an exit to userspace is also unfortunate but it's not the end of the world. On the plus side, the complete_userspace_io() callback doesn't need to worry about taking the lock. --- arch/x86/kvm/x86.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ac83d873d65b..90751a080447 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10039,7 +10039,6 @@ static int vcpu_run(struct kvm_vcpu *vcpu) int r; struct kvm *kvm = vcpu->kvm; - vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); vcpu->arch.l1tf_flush_l1d = true; for (;;) { @@ -10067,25 +10066,18 @@ static int vcpu_run(struct kvm_vcpu *vcpu) if (__xfer_to_guest_mode_work_pending()) { srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); r = xfer_to_guest_mode_handle_work(vcpu); + vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); if (r) return r; - vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); } } - srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); - return r; } static inline int complete_emulated_io(struct kvm_vcpu *vcpu) { - int r; - - vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); - r = kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE); - srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); - return r; + return kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE); } static int complete_emulated_pio(struct kvm_vcpu *vcpu) @@ -10224,12 +10216,16 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) kvm_run->flags = 0; kvm_load_guest_fpu(vcpu); + vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) { if (kvm_run->immediate_exit) { r = -EINTR; goto out; } + srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); kvm_vcpu_block(vcpu); + vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); + if (kvm_apic_accept_events(vcpu) < 0) { r = 0; goto out; } @@ -10279,10 +10275,13 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) r = vcpu_run(vcpu); out: - kvm_put_guest_fpu(vcpu); if (kvm_run->kvm_valid_regs) store_regs(vcpu); post_kvm_run_save(vcpu); + + srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); + + kvm_put_guest_fpu(vcpu); kvm_sigset_deactivate(vcpu); vcpu_put(vcpu); --