On Sat, Sep 30, 2023, David Woodhouse wrote:
> diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
> index 40edf4d1974c..75586da134b3 100644
> --- a/arch/x86/kvm/xen.c
> +++ b/arch/x86/kvm/xen.c
> @@ -134,9 +134,23 @@ static enum hrtimer_restart xen_timer_callback(struct hrtimer *timer)
>  {
>  	struct kvm_vcpu *vcpu = container_of(timer, struct kvm_vcpu,
>  					     arch.xen.timer);
> +	struct kvm_xen_evtchn e;
> +	int rc;
> +
>  	if (atomic_read(&vcpu->arch.xen.timer_pending))
>  		return HRTIMER_NORESTART;
>
> +	e.vcpu_id = vcpu->vcpu_id;
> +	e.vcpu_idx = vcpu->vcpu_idx;
> +	e.port = vcpu->arch.xen.timer_virq;
> +	e.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;
> +
> +	rc = kvm_xen_set_evtchn_fast(&e, vcpu->kvm);
> +	if (rc != -EWOULDBLOCK) {
> +		vcpu->arch.xen.timer_expires = 0;
> +		return HRTIMER_NORESTART;
> +	}
> +
>  	atomic_inc(&vcpu->arch.xen.timer_pending);
>  	kvm_make_request(KVM_REQ_UNBLOCK, vcpu);
>  	kvm_vcpu_kick(vcpu);
> @@ -146,6 +160,14 @@ static enum hrtimer_restart xen_timer_callback(struct hrtimer *timer)
>
>  static void kvm_xen_start_timer(struct kvm_vcpu *vcpu, u64 guest_abs, s64 delta_ns)
>  {
> +	/*
> +	 * Avoid races with the old timer firing. Checking timer_expires
> +	 * to avoid calling hrtimer_cancel() will only have false positives
> +	 * so is fine.
> +	 */
> +	if (vcpu->arch.xen.timer_expires)
> +		hrtimer_cancel(&vcpu->arch.xen.timer);
> +
>  	atomic_set(&vcpu->arch.xen.timer_pending, 0);
>  	vcpu->arch.xen.timer_expires = guest_abs;
>
> @@ -1019,9 +1041,36 @@ int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
>  		break;
>
>  	case KVM_XEN_VCPU_ATTR_TYPE_TIMER:
> +		/*
> +		 * Ensure a consistent snapshot of state is captured, with a
> +		 * timer either being pending, or the event channel delivered
> +		 * to the corresponding bit in the shared_info. Not still
> +		 * lurking in the timer_pending flag for deferred delivery.
> +		 * Purely as an optimisation, if the timer_expires field is
> +		 * zero, that means the timer isn't active (or even in the
> +		 * timer_pending flag) and there is no need to cancel it.
> +		 */
> +		if (vcpu->arch.xen.timer_expires) {
> +			hrtimer_cancel(&vcpu->arch.xen.timer);
> +			kvm_xen_inject_timer_irqs(vcpu);

This has an obvious-in-hindsight recursive deadlock bug.  If KVM actually needs
to inject a timer IRQ, and the fast path fails, i.e. the gpc is invalid,
kvm_xen_set_evtchn() will attempt to acquire xen.xen_lock, which is already
held by kvm_xen_vcpu_get_attr().

Not sure if I sucked at testing before, or if I just got "lucky" on a random run.

  ============================================
  WARNING: possible recursive locking detected
  6.8.0-smp--5e10b4d51d77-drs #232 Tainted: G O
  --------------------------------------------
  xen_shinfo_test/250013 is trying to acquire lock:
  ffff938c9930cc30 (&kvm->arch.xen.xen_lock){+.+.}-{3:3}, at: kvm_xen_set_evtchn+0x74/0x170 [kvm]

  but task is already holding lock:
  ffff938c9930cc30 (&kvm->arch.xen.xen_lock){+.+.}-{3:3}, at: kvm_xen_vcpu_get_attr+0x38/0x250 [kvm]

  other info that might help us debug this:
   Possible unsafe locking scenario:

         CPU0
         ----
    lock(&kvm->arch.xen.xen_lock);
    lock(&kvm->arch.xen.xen_lock);

   *** DEADLOCK ***

   May be due to missing lock nesting notation

  2 locks held by xen_shinfo_test/250013:
   #0: ffff9228863f21a8 (&vcpu->mutex){+.+.}-{3:3}, at: kvm_vcpu_ioctl+0x8f/0x5b0 [kvm]
   #1: ffff938c9930cc30 (&kvm->arch.xen.xen_lock){+.+.}-{3:3}, at: kvm_xen_vcpu_get_attr+0x38/0x250 [kvm]

  stack backtrace:
  CPU: 128 PID: 250013 Comm: xen_shinfo_test Tainted: G O 6.8.0-smp--5e10b4d51d77-drs #232
  Hardware name: Google, Inc. Arcadia_IT_80/Arcadia_IT_80, BIOS 34.30.0 07/27/2023
  Call Trace:
   <TASK>
   dump_stack_lvl+0x69/0xa0
   dump_stack+0x14/0x20
   print_deadlock_bug+0x2af/0x2c0
   __lock_acquire+0x13f7/0x2e30
   lock_acquire+0xd4/0x220
   __mutex_lock+0x6a/0xa60
   mutex_lock_nested+0x1f/0x30
   kvm_xen_set_evtchn+0x74/0x170 [kvm]
   kvm_xen_vcpu_get_attr+0x136/0x250 [kvm]
   kvm_arch_vcpu_ioctl+0x942/0x1130 [kvm]
   kvm_vcpu_ioctl+0x484/0x5b0 [kvm]
   __se_sys_ioctl+0x7a/0xc0
   __x64_sys_ioctl+0x21/0x30
   do_syscall_64+0x82/0x160
   entry_SYSCALL_64_after_hwframe+0x63/0x6b
  RIP: 0033:0x460eab