On Thu, 14 Nov 2019 14:15:16 -0600 Suravee Suthikulpanit <suravee.suthikulpanit@xxxxxxx> wrote: > AMD SVM AVIC accelerates EOI write and does not trap. This causes > in-kernel PIT re-injection mode to fail since it relies on irq-ack > notifier mechanism. So, APICv is activated only when in-kernel PIT > is in discard mode e.g. w/ qemu option: > > -global kvm-pit.lost_tick_policy=discard > > Also, introduce APICV_INHIBIT_REASON_PIT_REINJ bit to be used for this > reason. > > Suggested-by: Paolo Bonzini <pbonzini@xxxxxxxxxx> > Signed-off-by: Suravee Suthikulpanit <suravee.suthikulpanit@xxxxxxx> > --- Hi, I've bisected https://bugzilla.kernel.org/show_bug.cgi?id=206579 (a kernel NULL pointer deref when using device assigned on AMD platforms) to this commit, e2ed4078a6ef3ddf4063329298852e24c36d46c8. My VM is a very basic libvirt managed domain with an assigned NIC, I don't even have an OS installed: /usr/bin/qemu-system-x86_64 \ -name guest=fedora31,debug-threads=on \ -S \ -object secret,id=masterKey0,format=raw,file=/var/lib/libvirt/qemu/domain-1-fedora31/master-key.aes \ -machine pc-q35-3.1,accel=kvm,usb=off,vmport=off,dump-guest-core=off \ -cpu EPYC-IBPB,x2apic=on,tsc-deadline=on,hypervisor=on,tsc_adjust=on,xsaves=on,cmp_legacy=on,perfctr_core=on,virt-ssbd=on,monitor=off \ -m 8192 \ -realtime mlock=off \ -smp 8,sockets=8,cores=1,threads=1 \ -uuid a9639aa6-b3c1-4b45-b07b-80e0ad6d7df2 \ -no-user-config \ -nodefaults \ -chardev socket,id=charmonitor,fd=30,server,nowait \ -mon chardev=charmonitor,id=monitor,mode=control \ -rtc base=utc,driftfix=slew \ -global kvm-pit.lost_tick_policy=delay \ -no-hpet \ -no-shutdown \ -global ICH9-LPC.disable_s3=1 \ -global ICH9-LPC.disable_s4=1 \ -boot strict=on \ -device pcie-root-port,port=0x10,chassis=1,id=pci.1,bus=pcie.0,multifunction=on,addr=0x2 \ -device pcie-root-port,port=0x11,chassis=2,id=pci.2,bus=pcie.0,addr=0x2.0x1 \ -device pcie-root-port,port=0x12,chassis=3,id=pci.3,bus=pcie.0,addr=0x2.0x2 \ -device pcie-root-port,port=0x13,chassis=4,id=pci.4,bus=pcie.0,addr=0x2.0x3 \ -device pcie-root-port,port=0x14,chassis=5,id=pci.5,bus=pcie.0,addr=0x2.0x4 \ -device pcie-root-port,port=0x15,chassis=6,id=pci.6,bus=pcie.0,addr=0x2.0x5 \ -device pcie-root-port,port=0x16,chassis=7,id=pci.7,bus=pcie.0,addr=0x2.0x6 \ -device qemu-xhci,p2=15,p3=15,id=usb,bus=pci.2,addr=0x0 \ -drive file=/var/lib/libvirt/images/fedora31.qcow2,format=qcow2,if=none,id=drive-virtio-disk0 \ -device virtio-blk-pci,scsi=off,bus=pci.3,addr=0x0,drive=drive-virtio-disk0,id=virtio-disk0,bootindex=1 \ -netdev tap,fd=32,id=hostnet0,vhost=on,vhostfd=33 \ -device virtio-net-pci,netdev=hostnet0,id=net0,mac=52:54:00:c4:c4:fb,bus=pci.1,addr=0x0 \ -vnc 127.0.0.1:0 \ -device VGA,id=video0,vgamem_mb=16,bus=pcie.0,addr=0x1 \ -device vfio-pci,host=01:00.0,id=hostdev0,bus=pci.4,addr=0x0 \ -device virtio-balloon-pci,id=balloon0,bus=pci.5,addr=0x0 \ -object rng-random,id=objrng0,filename=/dev/urandom \ -device virtio-rng-pci,rng=objrng0,id=rng0,bus=pci.6,addr=0x0 \ -sandbox on,obsolete=deny,elevateprivileges=deny,spawn=deny,resourcecontrol=deny \ -msg timestamp=on This results in: BUG: kernel NULL pointer dereference, address: 0000000000000010 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: 0000 [#1] SMP NOPTI CPU: 54 PID: 31469 Comm: CPU 0/KVM Not tainted 5.5.0+ #24 Hardware name: AMD Corporation Diesel/Diesel, BIOS RDL100BB 11/14/2018 RIP: 0010:svm_refresh_apicv_exec_ctrl+0xe4/0x110 [kvm_amd] Code: 8b 83 b8 39 00 00 48 39 c5 74 31 48 8b 9b b8 39 00 00 48 39 dd 75 13 eb 23 e8 c8 0d 97 d6 85 c0 75 1a 48 8b 1b 48 39 dd 74 12 <48> 8b 7b 10 45 85 e4 75 e6 e8 1e 0d 97 d6 85 c0 74 e6 5b 4c 89 ee RSP: 0018:ffff99ae87923d70 EFLAGS: 00010086 RAX: 0000000000000000 RBX: 0000000000000000 RCX: ffff8d2323b0a000 RDX: 0000000000000001 RSI: ffff8d232e76c600 RDI: ffff8d232c9bf398 RBP: ffff8d232c9bf388 R08: 0000000000000000 R09: ffff8d232e76c600 R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000 R13: 0000000000000202 R14: ffff8d232c9bf398 R15: ffff99ae86e361a0 FS: 00007f2aa3d7d700(0000) GS:ffff8d232fd80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000010 CR3: 000000046c716000 CR4: 00000000003406e0 Call Trace: kvm_arch_vcpu_ioctl_run+0x335/0x1a90 [kvm] ? do_futex+0x86b/0xca0 ? __seccomp_filter+0x7b/0x670 kvm_vcpu_ioctl+0x218/0x5c0 [kvm] ksys_ioctl+0x87/0xc0 __x64_sys_ioctl+0x16/0x20 do_syscall_64+0x5b/0x1b0 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7f2aaa9c6fcb Code: 0f 1e fa 48 8b 05 bd ce 0c 00 64 c7 00 26 00 00 00 48 c7 c0 ff ff ff ff c3 66 0f 1f 44 00 00 f3 0f 1e fa b8 10 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 8d ce 0c 00 f7 d8 64 89 01 48 RSP: 002b:00007f2aa3d7c688 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 RAX: ffffffffffffffda RBX: 00005623b5362220 RCX: 00007f2aaa9c6fcb RDX: 0000000000000000 RSI: 000000000000ae80 RDI: 000000000000000f RBP: 00007f2aabc65000 R08: 00005623b4316f30 R09: 0000000000000000 R10: 00005623b52f2280 R11: 0000000000000246 R12: 00005623b5382e70 R13: 00005623b5362220 R14: 00005623b478a7c0 R15: 00007f2aa3d7c880 Modules linked in: kvm_amd ccp kvm vhost_net vhost macvtap macvlan tap vfio_pci vfio_virqfd vfio_iommu_type1 vfio irqbypass xt_CHECKSUM xt_MASQUERADE xt_conntrack tun bridge stp llc ip6table_mangle ip6table_nat iptable_mangle iptable_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 libcrc32c ebtable_filter ebtables ip6table_filter ip6_tables rfkill rpcrdma ib_isert iscsi_target_mod ib_iser libiscsi scsi_transport_iscsi ib_srpt target_core_mod sunrpc ib_srp scsi_transport_srp ib_ipoib rdma_ucm ib_umad vfat fat rdma_cm ib_cm iw_cm amd64_edac_mod edac_mce_amd i40iw ipmi_ssif ib_uverbs crct10dif_pclmul crc32_pclmul ib_core ghash_clmulni_intel pcspkr joydev sp5100_tco ipmi_si k10temp i2c_piix4 ipmi_devintf ipmi_msghandler acpi_cpufreq nouveau ast video drm_vram_helper mxm_wmi wmi drm_ttm_helper i2c_algo_bit drm_kms_helper cec ttm drm i40e e1000e crc32c_intel nvme nvme_core pinctrl_amd [last unloaded: ccp] CR2: 0000000000000010 ---[ end trace 5d826c21656a44f3 ]--- RIP: 0010:svm_refresh_apicv_exec_ctrl+0xe4/0x110 [kvm_amd] Code: 8b 83 b8 39 00 00 48 39 c5 74 31 48 8b 9b b8 39 00 00 48 39 dd 75 13 eb 23 e8 c8 0d 97 d6 85 c0 75 1a 48 8b 1b 48 39 dd 74 12 <48> 8b 7b 10 45 85 e4 75 e6 e8 1e 0d 97 d6 85 c0 74 e6 5b 4c 89 ee RSP: 0018:ffff99ae87923d70 EFLAGS: 00010086 RAX: 0000000000000000 RBX: 0000000000000000 RCX: ffff8d2323b0a000 RDX: 0000000000000001 RSI: ffff8d232e76c600 RDI: ffff8d232c9bf398 RBP: ffff8d232c9bf388 R08: 0000000000000000 R09: ffff8d232e76c600 R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000 R13: 0000000000000202 R14: ffff8d232c9bf398 R15: ffff99ae86e361a0 FS: 00007f2aa3d7d700(0000) GS:ffff8d232fd80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000010 CR3: 000000046c716000 CR4: 00000000003406e0 Please fix. Thanks, Alex > arch/x86/include/asm/kvm_host.h | 1 + > arch/x86/kvm/i8254.c | 12 ++++++++++++ > arch/x86/kvm/svm.c | 11 +++++++++-- > 3 files changed, 22 insertions(+), 2 deletions(-) > > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h > index 4b51222..9cb2d2e 100644 > --- a/arch/x86/include/asm/kvm_host.h > +++ b/arch/x86/include/asm/kvm_host.h > @@ -853,6 +853,7 @@ enum kvm_irqchip_mode { > #define APICV_INHIBIT_REASON_HYPERV 1 > #define APICV_INHIBIT_REASON_NESTED 2 > #define APICV_INHIBIT_REASON_IRQWIN 3 > +#define APICV_INHIBIT_REASON_PIT_REINJ 4 > > struct kvm_arch { > unsigned long n_used_mmu_pages; > diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c > index 4a6dc54..b24c606 100644 > --- a/arch/x86/kvm/i8254.c > +++ b/arch/x86/kvm/i8254.c > @@ -295,12 +295,24 @@ void kvm_pit_set_reinject(struct kvm_pit *pit, bool reinject) > if (atomic_read(&ps->reinject) == reinject) > return; > > + /* > + * AMD SVM AVIC accelerates EOI write and does not trap. > + * This cause in-kernel PIT re-inject mode to fail > + * since it checks ps->irq_ack before kvm_set_irq() > + * and relies on the ack notifier to timely queue > + * the pt->worker work iterm and reinject the missed tick. > + * So, deactivate APICv when PIT is in reinject mode. > + */ > if (reinject) { > + kvm_request_apicv_update(kvm, false, > + APICV_INHIBIT_REASON_PIT_REINJ); > /* The initial state is preserved while ps->reinject == 0. */ > kvm_pit_reset_reinject(pit); > kvm_register_irq_ack_notifier(kvm, &ps->irq_ack_notifier); > kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier); > } else { > + kvm_request_apicv_update(kvm, true, > + APICV_INHIBIT_REASON_PIT_REINJ); > kvm_unregister_irq_ack_notifier(kvm, &ps->irq_ack_notifier); > kvm_unregister_irq_mask_notifier(kvm, 0, &pit->mask_notifier); > } > diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c > index b7883b3..2dfdd7c 100644 > --- a/arch/x86/kvm/svm.c > +++ b/arch/x86/kvm/svm.c > @@ -1684,7 +1684,13 @@ static int avic_update_access_page(struct kvm *kvm, bool activate) > int ret = 0; > > mutex_lock(&kvm->slots_lock); > - if (kvm->arch.apic_access_page_done == activate) > + /* > + * During kvm_destroy_vm(), kvm_pit_set_reinject() could trigger > + * APICv mode change, which update APIC_ACCESS_PAGE_PRIVATE_MEMSLOT > + * memory region. So, we need to ensure that kvm->mm == current->mm. > + */ > + if ((kvm->arch.apic_access_page_done == activate) || > + (kvm->mm != current->mm)) > goto out; > > ret = __x86_set_memory_region(kvm, > @@ -7281,7 +7287,8 @@ static bool svm_check_apicv_inhibit_reasons(ulong bit) > ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) | > BIT(APICV_INHIBIT_REASON_HYPERV) | > BIT(APICV_INHIBIT_REASON_NESTED) | > - BIT(APICV_INHIBIT_REASON_IRQWIN); > + BIT(APICV_INHIBIT_REASON_IRQWIN) | > + BIT(APICV_INHIBIT_REASON_PIT_REINJ); > > return supported & BIT(bit); > }