On Wed, Dec 14, 2022 at 01:39:58PM -0600, Michael Roth wrote: > In some cases, like with SEV-SNP, guest memory needs to be updated in a > platform-specific manner before it can be safely freed back to the host. > Add hooks to wire up handling of this sort to the invalidation notifiers > for restricted memory. > > Also issue invalidations of all allocated pages during notifier > unregistration so that the pages are not left in an unusable state when > they eventually get freed back to the host upon FD release. > > Signed-off-by: Michael Roth <michael.roth@xxxxxxx> > --- > arch/x86/include/asm/kvm-x86-ops.h | 1 + > arch/x86/include/asm/kvm_host.h | 1 + > arch/x86/kvm/mmu/mmu.c | 5 +++++ > include/linux/kvm_host.h | 2 ++ > mm/restrictedmem.c | 16 ++++++++++++++++ > virt/kvm/kvm_main.c | 5 +++++ > 6 files changed, 30 insertions(+) > > diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h > index 52f94a0ba5e9..c71df44b0f02 100644 > --- a/arch/x86/include/asm/kvm-x86-ops.h > +++ b/arch/x86/include/asm/kvm-x86-ops.h > @@ -134,6 +134,7 @@ KVM_X86_OP_OPTIONAL_RET0(vcpu_get_apicv_inhibit_reasons); > KVM_X86_OP_OPTIONAL_RET0(private_mem_enabled); > KVM_X86_OP_OPTIONAL_RET0(fault_is_private); > KVM_X86_OP_OPTIONAL_RET0(update_mem_attr) > +KVM_X86_OP_OPTIONAL(invalidate_restricted_mem) > > #undef KVM_X86_OP > #undef KVM_X86_OP_OPTIONAL > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h > index 13802389f0f9..9ef8d73455d9 100644 > --- a/arch/x86/include/asm/kvm_host.h > +++ b/arch/x86/include/asm/kvm_host.h > @@ -1639,6 +1639,7 @@ struct kvm_x86_ops { > int (*fault_is_private)(struct kvm *kvm, gpa_t gpa, u64 error_code, bool *private_fault); > int (*update_mem_attr)(struct kvm_memory_slot *slot, unsigned int attr, > gfn_t start, gfn_t end); > + void (*invalidate_restricted_mem)(struct kvm_memory_slot *slot, gfn_t start, gfn_t end); > > bool (*has_wbinvd_exit)(void); > > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c > index a0c41d391547..2713632e5061 100644 > --- a/arch/x86/kvm/mmu/mmu.c > +++ b/arch/x86/kvm/mmu/mmu.c > @@ -7183,3 +7183,8 @@ void kvm_arch_set_memory_attributes(struct kvm *kvm, > kvm_update_lpage_private_shared_mixed(kvm, slot, attrs, > start, end); > } > + > +void kvm_arch_invalidate_restricted_mem(struct kvm_memory_slot *slot, gfn_t start, gfn_t end) > +{ > + static_call_cond(kvm_x86_invalidate_restricted_mem)(slot, start, end); > +} > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h > index f032d878e034..f72a2e0b8699 100644 > --- a/include/linux/kvm_host.h > +++ b/include/linux/kvm_host.h > @@ -2327,6 +2327,7 @@ void kvm_arch_set_memory_attributes(struct kvm *kvm, > struct kvm_memory_slot *slot, > unsigned long attrs, > gfn_t start, gfn_t end); > + > #else > static inline void kvm_arch_set_memory_attributes(struct kvm *kvm, > struct kvm_memory_slot *slot, > @@ -2366,6 +2367,7 @@ static inline int kvm_restricted_mem_get_pfn(struct kvm_memory_slot *slot, > } > > void kvm_arch_memory_mce(struct kvm *kvm); > +void kvm_arch_invalidate_restricted_mem(struct kvm_memory_slot *slot, gfn_t start, gfn_t end); > #endif /* CONFIG_HAVE_KVM_RESTRICTED_MEM */ > > #endif > diff --git a/mm/restrictedmem.c b/mm/restrictedmem.c > index 56953c204e5c..74fa2cfb8618 100644 > --- a/mm/restrictedmem.c > +++ b/mm/restrictedmem.c > @@ -54,6 +54,11 @@ static int restrictedmem_release(struct inode *inode, struct file *file) > { > struct restrictedmem_data *data = inode->i_mapping->private_data; > > + pr_debug("%s: releasing memfd, invalidating page offsets 0x0-0x%llx\n", > + __func__, inode->i_size >> PAGE_SHIFT); > + restrictedmem_invalidate_start(data, 0, inode->i_size >> PAGE_SHIFT); > + restrictedmem_invalidate_end(data, 0, inode->i_size >> PAGE_SHIFT); > + > fput(data->memfd); > kfree(data); > return 0; > @@ -258,6 +263,17 @@ void restrictedmem_unregister_notifier(struct file *file, > struct restrictedmem_notifier *notifier) > { > struct restrictedmem_data *data = file->f_mapping->private_data; > + struct inode *inode = file_inode(data->memfd); > + > + /* TODO: this will issue notifications to all registered notifiers, > + * but it's only the one being unregistered that needs to process > + * invalidations for any ranges still allocated at this point in > + * time. For now this relies on KVM currently being the only notifier. > + */ > + pr_debug("%s: unregistering notifier, invalidating page offsets 0x0-0x%llx\n", > + __func__, inode->i_size >> PAGE_SHIFT); > + restrictedmem_invalidate_start(data, 0, inode->i_size >> PAGE_SHIFT); > + restrictedmem_invalidate_end(data, 0, inode->i_size >> PAGE_SHIFT); > > mutex_lock(&data->lock); > list_del(¬ifier->list); > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c > index d2d829d23442..d2daa049e94a 100644 > --- a/virt/kvm/kvm_main.c > +++ b/virt/kvm/kvm_main.c > @@ -974,6 +974,9 @@ static void kvm_restrictedmem_invalidate_begin(struct restrictedmem_notifier *no > &gfn_start, &gfn_end)) > return; > > + pr_debug("%s: start: 0x%lx, end: 0x%lx, roffset: 0x%llx, gfn_start: 0x%llx, gfn_end: 0x%llx\n", > + __func__, start, end, slot->restricted_offset, gfn_start, gfn_end); > + > gfn_range.start = gfn_start; > gfn_range.end = gfn_end; > gfn_range.slot = slot; > @@ -988,6 +991,8 @@ static void kvm_restrictedmem_invalidate_begin(struct restrictedmem_notifier *no > if (kvm_unmap_gfn_range(kvm, &gfn_range)) > kvm_flush_remote_tlbs(kvm); > > + kvm_arch_invalidate_restricted_mem(slot, gfn_start, gfn_end); Calling kvm_arch_invalidate_restricted_mem while the KVM MMU lock is taken causes problems, because taking said lock disables preemption. Within kvm_arch_invalidate_restricted_mem a few calls down, eventually vm_unmap_aliases is called which tries to lock a mutex, which shouldn't happen with preemption disabled. This causes a "scheduling while atomic" bug: [ 152.846596] BUG: scheduling while atomic: enarx/8302/0x00000002 [ 152.846599] Modules linked in: nf_conntrack_netlink(E) xfrm_user(E) xfrm_algo(E) xt_addrtype(E) br_netfilter(E) xt_CHECKSUM(E) xt_MASQUERADE(E) xt_conntrack(E) ipt_REJECT(E) nf_reject_ipv4(E) xt_tcpudp(E) nft_compat(E) nft_chain_nat(E) nf_nat(E) nf_conntrack(E) nf_defrag_ipv6(E) nf_defrag_ipv4(E) nf_tables(E) libcrc32c(E) nfnetlink(E) bridge(E) stp(E) llc(E) bonding(E) intel_rapl_msr(E) intel_rapl_common(E) amd64_edac(E) edac_mce_amd(E) kvm_amd(E) tun(E) ipmi_ssif(E) rfkill(E) overlay(E) ghash_clmulni_intel(E) sha512_ssse3(E) sha512_generic(E) aesni_intel(E) libaes(E) crypto_simd(E) cryptd(E) rapl(E) wmi_bmof(E) binfmt_misc(E) kvm(E) irqbypass(E) nls_ascii(E) nls_cp437(E) vfat(E) fat(E) snd_usb_audio(E) snd_usbmidi_lib(E) snd_hwdep(E) mc(E) snd_pcm(E) snd_seq_midi(E) snd_seq_midi_event(E) snd_rawmidi(E) snd_seq(E) ast(E) snd_seq_device(E) drm_vram_helper(E) drm_ttm_helper(E) snd_timer(E) ttm(E) joydev(E) snd(E) ccp(E) drm_kms_helper(E) soundcore(E) sg(E) i2c_algo_bit(E) rng_core(E) [ 152.846629] k10temp(E) evdev(E) acpi_ipmi(E) ipmi_si(E) ipmi_devintf(E) ipmi_msghandler(E) acpi_cpufreq(E) button(E) squashfs(E) loop(E) sch_fq_codel(E) msr(E) parport_pc(E) ppdev(E) lp(E) ramoops(E) parport(E) reed_solomon(E) fuse(E) drm(E) efi_pstore(E) configfs(E) efivarfs(E) ip_tables(E) x_tables(E) autofs4(E) ext4(E) crc32c_generic(E) crc16(E) mbcache(E) jbd2(E) rndis_host(E) cdc_ether(E) usbnet(E) mii(E) hid_generic(E) usbhid(E) hid(E) sd_mod(E) t10_pi(E) crc64_rocksoft(E) crc64(E) crc_t10dif(E) crct10dif_generic(E) crct10dif_pclmul(E) crct10dif_common(E) crc32_pclmul(E) crc32c_intel(E) ahci(E) libahci(E) xhci_pci(E) libata(E) bnxt_en(E) xhci_hcd(E) scsi_mod(E) ptp(E) scsi_common(E) pps_core(E) usbcore(E) i2c_piix4(E) usb_common(E) wmi(E) [ 152.846657] Preemption disabled at: [ 152.846657] [<ffffffffc146a09a>] kvm_restrictedmem_invalidate_begin+0xba/0x1c0 [kvm] [ 152.846688] CPU: 108 PID: 8302 Comm: enarx Tainted: G W E 6.1.0-rc4+ #30 [ 152.846690] Hardware name: Supermicro Super Server/H12SSL-NT, BIOS 2.4 04/14/2022 [ 152.846691] Call Trace: [ 152.846692] <TASK> [ 152.846694] dump_stack_lvl+0x49/0x63 [ 152.846695] ? kvm_restrictedmem_invalidate_begin+0xba/0x1c0 [kvm] [ 152.846723] dump_stack+0x10/0x16 [ 152.846725] __schedule_bug.cold+0x81/0x92 [ 152.846727] __schedule+0x809/0xa00 [ 152.846729] ? asm_sysvec_call_function+0x1b/0x20 [ 152.846731] schedule+0x6b/0xf0 [ 152.846733] schedule_preempt_disabled+0x18/0x30 [ 152.846735] __mutex_lock.constprop.0+0x723/0x750 [ 152.846738] ? smp_call_function_many_cond+0xc1/0x2e0 [ 152.846740] __mutex_lock_slowpath+0x13/0x20 [ 152.846742] mutex_lock+0x49/0x60 [ 152.846744] _vm_unmap_aliases+0x10e/0x160 [ 152.846746] vm_unmap_aliases+0x19/0x20 [ 152.846748] change_page_attr_set_clr+0xb7/0x1c0 [ 152.846751] set_memory_p+0x29/0x30 [ 152.846753] rmpupdate+0xd5/0x110 [ 152.846756] rmp_make_shared+0xb7/0xc0 [ 152.846758] snp_make_page_shared.constprop.0+0x4c/0x90 [kvm_amd] [ 152.846765] sev_invalidate_private_range+0x156/0x330 [kvm_amd] [ 152.846770] ? kvm_unmap_gfn_range+0xef/0x100 [kvm] [ 152.846801] kvm_arch_invalidate_restricted_mem+0xe/0x20 [kvm] [ 152.846829] kvm_restrictedmem_invalidate_begin+0x106/0x1c0 [kvm] [ 152.846856] restrictedmem_unregister_notifier+0x74/0x150 [ 152.846859] kvm_free_memslot+0x6b/0x80 [kvm] [ 152.846885] kvm_free_memslots.part.0+0x47/0x70 [kvm] [ 152.846911] kvm_destroy_vm+0x222/0x320 [kvm] [ 152.846937] kvm_put_kvm+0x2a/0x50 [kvm] [ 152.846964] kvm_vm_release+0x22/0x30 [kvm] [ 152.846990] __fput+0xa8/0x280 [ 152.846992] ____fput+0xe/0x20 [ 152.846994] task_work_run+0x61/0xb0 [ 152.846996] do_exit+0x362/0xb30 [ 152.846998] ? tomoyo_path_number_perm+0x6f/0x200 [ 152.847001] do_group_exit+0x38/0xa0 [ 152.847003] get_signal+0x999/0x9c0 [ 152.847005] arch_do_signal_or_restart+0x37/0x7e0 [ 152.847008] ? __might_fault+0x26/0x30 [ 152.847010] ? __rseq_handle_notify_resume+0xd5/0x4f0 [ 152.847013] exit_to_user_mode_prepare+0xd3/0x170 [ 152.847016] syscall_exit_to_user_mode+0x26/0x50 [ 152.847019] do_syscall_64+0x48/0x90 [ 152.847020] entry_SYSCALL_64_after_hwframe+0x63/0xcd [ 152.847022] RIP: 0033:0x7fa345f1aaff [ 152.847023] Code: Unable to access opcode bytes at 0x7fa345f1aad5. [ 152.847024] RSP: 002b:00007fff99d6c050 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 [ 152.847026] RAX: 0000000000000000 RBX: 0000000000000001 RCX: 00007fa345f1aaff [ 152.847027] RDX: 00007fff99d6c188 RSI: 00000000c008aeba RDI: 0000000000000006 [ 152.847028] RBP: 00007fff99576000 R08: 0000000000000000 R09: 0000000000000000 [ 152.847029] R10: 0000000001680000 R11: 0000000000000246 R12: 00007fff99d752c0 [ 152.847030] R13: 00007fff99d75270 R14: 0000000000000000 R15: 00007fff99577000 [ 152.847032] </TASK> This bug can be triggered by destroying multiple SNP VMs at the same time. > + > KVM_MMU_UNLOCK(kvm); > srcu_read_unlock(&kvm->srcu, idx); > } > -- > 2.25.1 > > Regards, Tom