Re: [PATCH RFC v7 06/64] KVM: x86: Add platform hooks for private memory invalidations

On Wed, Dec 14, 2022 at 01:39:58PM -0600, Michael Roth wrote:
> In some cases, like with SEV-SNP, guest memory needs to be updated in a
> platform-specific manner before it can be safely freed back to the host.
> Add hooks to wire up handling of this sort to the invalidation notifiers
> for restricted memory.
> 
> Also issue invalidations of all allocated pages during notifier
> unregistration so that the pages are not left in an unusable state when
> they eventually get freed back to the host upon FD release.
> 
> Signed-off-by: Michael Roth <michael.roth@xxxxxxx>
> ---
>  arch/x86/include/asm/kvm-x86-ops.h |  1 +
>  arch/x86/include/asm/kvm_host.h    |  1 +
>  arch/x86/kvm/mmu/mmu.c             |  5 +++++
>  include/linux/kvm_host.h           |  2 ++
>  mm/restrictedmem.c                 | 16 ++++++++++++++++
>  virt/kvm/kvm_main.c                |  5 +++++
>  6 files changed, 30 insertions(+)
> 
> diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
> index 52f94a0ba5e9..c71df44b0f02 100644
> --- a/arch/x86/include/asm/kvm-x86-ops.h
> +++ b/arch/x86/include/asm/kvm-x86-ops.h
> @@ -134,6 +134,7 @@ KVM_X86_OP_OPTIONAL_RET0(vcpu_get_apicv_inhibit_reasons);
>  KVM_X86_OP_OPTIONAL_RET0(private_mem_enabled);
>  KVM_X86_OP_OPTIONAL_RET0(fault_is_private);
>  KVM_X86_OP_OPTIONAL_RET0(update_mem_attr)
> +KVM_X86_OP_OPTIONAL(invalidate_restricted_mem)
>  
>  #undef KVM_X86_OP
>  #undef KVM_X86_OP_OPTIONAL
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 13802389f0f9..9ef8d73455d9 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -1639,6 +1639,7 @@ struct kvm_x86_ops {
>  	int (*fault_is_private)(struct kvm *kvm, gpa_t gpa, u64 error_code, bool *private_fault);
>  	int (*update_mem_attr)(struct kvm_memory_slot *slot, unsigned int attr,
>  			       gfn_t start, gfn_t end);
> +	void (*invalidate_restricted_mem)(struct kvm_memory_slot *slot, gfn_t start, gfn_t end);
>  
>  	bool (*has_wbinvd_exit)(void);
>  
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index a0c41d391547..2713632e5061 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -7183,3 +7183,8 @@ void kvm_arch_set_memory_attributes(struct kvm *kvm,
>  		kvm_update_lpage_private_shared_mixed(kvm, slot, attrs,
>  						      start, end);
>  }
> +
> +void kvm_arch_invalidate_restricted_mem(struct kvm_memory_slot *slot, gfn_t start, gfn_t end)
> +{
> +	static_call_cond(kvm_x86_invalidate_restricted_mem)(slot, start, end);
> +}
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index f032d878e034..f72a2e0b8699 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -2327,6 +2327,7 @@ void kvm_arch_set_memory_attributes(struct kvm *kvm,
>  				    struct kvm_memory_slot *slot,
>  				    unsigned long attrs,
>  				    gfn_t start, gfn_t end);
> +
>  #else
>  static inline void kvm_arch_set_memory_attributes(struct kvm *kvm,
>  						  struct kvm_memory_slot *slot,
> @@ -2366,6 +2367,7 @@ static inline int kvm_restricted_mem_get_pfn(struct kvm_memory_slot *slot,
>  }
>  
>  void kvm_arch_memory_mce(struct kvm *kvm);
> +void kvm_arch_invalidate_restricted_mem(struct kvm_memory_slot *slot, gfn_t start, gfn_t end);
>  #endif /* CONFIG_HAVE_KVM_RESTRICTED_MEM */
>  
>  #endif
> diff --git a/mm/restrictedmem.c b/mm/restrictedmem.c
> index 56953c204e5c..74fa2cfb8618 100644
> --- a/mm/restrictedmem.c
> +++ b/mm/restrictedmem.c
> @@ -54,6 +54,11 @@ static int restrictedmem_release(struct inode *inode, struct file *file)
>  {
>  	struct restrictedmem_data *data = inode->i_mapping->private_data;
>  
> +	pr_debug("%s: releasing memfd, invalidating page offsets 0x0-0x%llx\n",
> +		 __func__, inode->i_size >> PAGE_SHIFT);
> +	restrictedmem_invalidate_start(data, 0, inode->i_size >> PAGE_SHIFT);
> +	restrictedmem_invalidate_end(data, 0, inode->i_size >> PAGE_SHIFT);
> +
>  	fput(data->memfd);
>  	kfree(data);
>  	return 0;
> @@ -258,6 +263,17 @@ void restrictedmem_unregister_notifier(struct file *file,
>  				       struct restrictedmem_notifier *notifier)
>  {
>  	struct restrictedmem_data *data = file->f_mapping->private_data;
> +	struct inode *inode = file_inode(data->memfd);
> +
> +	/* TODO: this will issue notifications to all registered notifiers,
> +	 * but it's only the one being unregistered that needs to process
> +	 * invalidations for any ranges still allocated at this point in
> +	 * time. For now this relies on KVM currently being the only notifier.
> +	 */
> +	pr_debug("%s: unregistering notifier, invalidating page offsets 0x0-0x%llx\n",
> +		 __func__, inode->i_size >> PAGE_SHIFT);
> +	restrictedmem_invalidate_start(data, 0, inode->i_size >> PAGE_SHIFT);
> +	restrictedmem_invalidate_end(data, 0, inode->i_size >> PAGE_SHIFT);
>  
>  	mutex_lock(&data->lock);
>  	list_del(&notifier->list);
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index d2d829d23442..d2daa049e94a 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -974,6 +974,9 @@ static void kvm_restrictedmem_invalidate_begin(struct restrictedmem_notifier *no
>  					  &gfn_start, &gfn_end))
>  		return;
>  
> +	pr_debug("%s: start: 0x%lx, end: 0x%lx, roffset: 0x%llx, gfn_start: 0x%llx, gfn_end: 0x%llx\n",
> +		 __func__, start, end, slot->restricted_offset, gfn_start, gfn_end);
> +
>  	gfn_range.start = gfn_start;
>  	gfn_range.end = gfn_end;
>  	gfn_range.slot = slot;
> @@ -988,6 +991,8 @@ static void kvm_restrictedmem_invalidate_begin(struct restrictedmem_notifier *no
>  	if (kvm_unmap_gfn_range(kvm, &gfn_range))
>  		kvm_flush_remote_tlbs(kvm);
>  
> +	kvm_arch_invalidate_restricted_mem(slot, gfn_start, gfn_end);

Calling kvm_arch_invalidate_restricted_mem() while the KVM MMU lock is held
is problematic, because taking that lock disables preemption. A few calls
down inside kvm_arch_invalidate_restricted_mem(), vm_unmap_aliases() is
eventually reached, and it tries to take a mutex, which must not happen
with preemption disabled. The result is a "scheduling while atomic" bug:

[  152.846596] BUG: scheduling while atomic: enarx/8302/0x00000002
[  152.846599] Modules linked in: nf_conntrack_netlink(E) xfrm_user(E) xfrm_algo(E) xt_addrtype(E) br_netfilter(E) xt_CHECKSUM(E) xt_MASQUERADE(E) xt_conntrack(E) ipt_REJECT(E) nf_reject_ipv4(E) xt_tcpudp(E) nft_compat(E) nft_chain_nat(E) nf_nat(E) nf_conntrack(E) nf_defrag_ipv6(E) nf_defrag_ipv4(E) nf_tables(E) libcrc32c(E) nfnetlink(E) bridge(E) stp(E) llc(E) bonding(E) intel_rapl_msr(E) intel_rapl_common(E) amd64_edac(E) edac_mce_amd(E) kvm_amd(E) tun(E) ipmi_ssif(E) rfkill(E) overlay(E) ghash_clmulni_intel(E) sha512_ssse3(E) sha512_generic(E) aesni_intel(E) libaes(E) crypto_simd(E) cryptd(E) rapl(E) wmi_bmof(E) binfmt_misc(E) kvm(E) irqbypass(E) nls_ascii(E) nls_cp437(E) vfat(E) fat(E) snd_usb_audio(E) snd_usbmidi_lib(E) snd_hwdep(E) mc(E) snd_pcm(E) snd_seq_midi(E) snd_seq_midi_event(E) snd_rawmidi(E) snd_seq(E) ast(E) snd_seq_device(E) drm_vram_helper(E) drm_ttm_helper(E) snd_timer(E) ttm(E) joydev(E) snd(E) ccp(E) drm_kms_helper(E) soundcore(E) sg(E) i2c_algo_bit(E) rng_core(E)
[  152.846629]  k10temp(E) evdev(E) acpi_ipmi(E) ipmi_si(E) ipmi_devintf(E) ipmi_msghandler(E) acpi_cpufreq(E) button(E) squashfs(E) loop(E) sch_fq_codel(E) msr(E) parport_pc(E) ppdev(E) lp(E) ramoops(E) parport(E) reed_solomon(E) fuse(E) drm(E) efi_pstore(E) configfs(E) efivarfs(E) ip_tables(E) x_tables(E) autofs4(E) ext4(E) crc32c_generic(E) crc16(E) mbcache(E) jbd2(E) rndis_host(E) cdc_ether(E) usbnet(E) mii(E) hid_generic(E) usbhid(E) hid(E) sd_mod(E) t10_pi(E) crc64_rocksoft(E) crc64(E) crc_t10dif(E) crct10dif_generic(E) crct10dif_pclmul(E) crct10dif_common(E) crc32_pclmul(E) crc32c_intel(E) ahci(E) libahci(E) xhci_pci(E) libata(E) bnxt_en(E) xhci_hcd(E) scsi_mod(E) ptp(E) scsi_common(E) pps_core(E) usbcore(E) i2c_piix4(E) usb_common(E) wmi(E)
[  152.846657] Preemption disabled at:
[  152.846657] [<ffffffffc146a09a>] kvm_restrictedmem_invalidate_begin+0xba/0x1c0 [kvm]
[  152.846688] CPU: 108 PID: 8302 Comm: enarx Tainted: G        W   E      6.1.0-rc4+ #30
[  152.846690] Hardware name: Supermicro Super Server/H12SSL-NT, BIOS 2.4 04/14/2022
[  152.846691] Call Trace:
[  152.846692]  <TASK>
[  152.846694]  dump_stack_lvl+0x49/0x63
[  152.846695]  ? kvm_restrictedmem_invalidate_begin+0xba/0x1c0 [kvm]
[  152.846723]  dump_stack+0x10/0x16
[  152.846725]  __schedule_bug.cold+0x81/0x92
[  152.846727]  __schedule+0x809/0xa00
[  152.846729]  ? asm_sysvec_call_function+0x1b/0x20
[  152.846731]  schedule+0x6b/0xf0
[  152.846733]  schedule_preempt_disabled+0x18/0x30
[  152.846735]  __mutex_lock.constprop.0+0x723/0x750
[  152.846738]  ? smp_call_function_many_cond+0xc1/0x2e0
[  152.846740]  __mutex_lock_slowpath+0x13/0x20
[  152.846742]  mutex_lock+0x49/0x60
[  152.846744]  _vm_unmap_aliases+0x10e/0x160
[  152.846746]  vm_unmap_aliases+0x19/0x20
[  152.846748]  change_page_attr_set_clr+0xb7/0x1c0
[  152.846751]  set_memory_p+0x29/0x30
[  152.846753]  rmpupdate+0xd5/0x110
[  152.846756]  rmp_make_shared+0xb7/0xc0
[  152.846758]  snp_make_page_shared.constprop.0+0x4c/0x90 [kvm_amd]
[  152.846765]  sev_invalidate_private_range+0x156/0x330 [kvm_amd]
[  152.846770]  ? kvm_unmap_gfn_range+0xef/0x100 [kvm]
[  152.846801]  kvm_arch_invalidate_restricted_mem+0xe/0x20 [kvm]
[  152.846829]  kvm_restrictedmem_invalidate_begin+0x106/0x1c0 [kvm]
[  152.846856]  restrictedmem_unregister_notifier+0x74/0x150
[  152.846859]  kvm_free_memslot+0x6b/0x80 [kvm]
[  152.846885]  kvm_free_memslots.part.0+0x47/0x70 [kvm]
[  152.846911]  kvm_destroy_vm+0x222/0x320 [kvm]
[  152.846937]  kvm_put_kvm+0x2a/0x50 [kvm]
[  152.846964]  kvm_vm_release+0x22/0x30 [kvm]
[  152.846990]  __fput+0xa8/0x280
[  152.846992]  ____fput+0xe/0x20
[  152.846994]  task_work_run+0x61/0xb0
[  152.846996]  do_exit+0x362/0xb30
[  152.846998]  ? tomoyo_path_number_perm+0x6f/0x200
[  152.847001]  do_group_exit+0x38/0xa0
[  152.847003]  get_signal+0x999/0x9c0
[  152.847005]  arch_do_signal_or_restart+0x37/0x7e0
[  152.847008]  ? __might_fault+0x26/0x30
[  152.847010]  ? __rseq_handle_notify_resume+0xd5/0x4f0
[  152.847013]  exit_to_user_mode_prepare+0xd3/0x170
[  152.847016]  syscall_exit_to_user_mode+0x26/0x50
[  152.847019]  do_syscall_64+0x48/0x90
[  152.847020]  entry_SYSCALL_64_after_hwframe+0x63/0xcd
[  152.847022] RIP: 0033:0x7fa345f1aaff
[  152.847023] Code: Unable to access opcode bytes at 0x7fa345f1aad5.
[  152.847024] RSP: 002b:00007fff99d6c050 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
[  152.847026] RAX: 0000000000000000 RBX: 0000000000000001 RCX: 00007fa345f1aaff
[  152.847027] RDX: 00007fff99d6c188 RSI: 00000000c008aeba RDI: 0000000000000006
[  152.847028] RBP: 00007fff99576000 R08: 0000000000000000 R09: 0000000000000000
[  152.847029] R10: 0000000001680000 R11: 0000000000000246 R12: 00007fff99d752c0
[  152.847030] R13: 00007fff99d75270 R14: 0000000000000000 R15: 00007fff99577000
[  152.847032]  </TASK>

This bug can be triggered by destroying multiple SNP VMs at the same time.
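One possible rearrangement (just an untested sketch, and it assumes the arch
hook doesn't actually rely on mmu_lock being held) would be to issue the
platform-specific invalidation only after KVM_MMU_UNLOCK(), while the memslot
is still protected by the SRCU read-side critical section, e.g.:

	if (kvm_unmap_gfn_range(kvm, &gfn_range))
		kvm_flush_remote_tlbs(kvm);

	KVM_MMU_UNLOCK(kvm);

	/*
	 * The platform-specific invalidation may sleep (per the trace above,
	 * rmpupdate() eventually reaches vm_unmap_aliases(), which takes a
	 * mutex), so issue it only after dropping mmu_lock; the slot is
	 * still protected by SRCU at this point.
	 */
	kvm_arch_invalidate_restricted_mem(slot, gfn_start, gfn_end);

	srcu_read_unlock(&kvm->srcu, idx);

However it ends up being structured, the hook needs to run in a context
where sleeping is allowed.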

> +
>  	KVM_MMU_UNLOCK(kvm);
>  	srcu_read_unlock(&kvm->srcu, idx);
>  }
> -- 
> 2.25.1
> 
> 

Regards, Tom



