On Mon, Jan 22, 2024, isaku.yamahata@xxxxxxxxx wrote: > diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c > index 4cbcedff4f16..1a5a91b99de9 100644 > --- a/arch/x86/kvm/vmx/tdx.c > +++ b/arch/x86/kvm/vmx/tdx.c > @@ -591,6 +591,69 @@ static int tdx_mem_page_aug(struct kvm *kvm, gfn_t gfn, > return 0; > } > > +static int tdx_mem_page_add(struct kvm *kvm, gfn_t gfn, > + enum pg_level level, kvm_pfn_t pfn) > +{ > + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); > + hpa_t hpa = pfn_to_hpa(pfn); > + gpa_t gpa = gfn_to_gpa(gfn); > + struct tdx_module_args out; > + hpa_t source_pa; > + bool measure; > + u64 err; > + int i; > + > + /* > + * KVM_INIT_MEM_REGION, tdx_init_mem_region(), supports only 4K page > + * because tdh_mem_page_add() supports only 4K page. > + */ > + if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm)) > + return -EINVAL; > + > + /* > + * In case of TDP MMU, fault handler can run concurrently. Note > + * 'source_pa' is a TD scope variable, meaning if there are multiple > + * threads reaching here with all needing to access 'source_pa', it > + * will break. However fortunately this won't happen, because below > + * TDH_MEM_PAGE_ADD code path is only used when VM is being created > + * before it is running, using KVM_TDX_INIT_MEM_REGION ioctl (which > + * always uses vcpu 0's page table and protected by vcpu->mutex). > + */ Most of the above is superfluous. tdx_mem_page_add() is called if and only if the TD is not yet finalized, and the TDX module disallows running vCPUs before the TD is finalized. That's it. And maybe throw in a lockdep to assert that kvm->lock is held.
> + if (KVM_BUG_ON(kvm_tdx->source_pa == INVALID_PAGE, kvm)) { > + tdx_unpin(kvm, pfn); > + return -EINVAL; > + } > + > + source_pa = kvm_tdx->source_pa & ~KVM_TDX_MEASURE_MEMORY_REGION; > + measure = kvm_tdx->source_pa & KVM_TDX_MEASURE_MEMORY_REGION; > + kvm_tdx->source_pa = INVALID_PAGE; > + > + do { > + err = tdh_mem_page_add(kvm_tdx->tdr_pa, gpa, hpa, source_pa, > + &out); > + /* > + * This path is executed during populating initial guest memory > + * image. i.e. before running any vcpu. Race is rare. How are races possible at all? > + */ > + } while (unlikely(err == TDX_ERROR_SEPT_BUSY)); > + if (KVM_BUG_ON(err, kvm)) { > + pr_tdx_error(TDH_MEM_PAGE_ADD, err, &out); > + tdx_unpin(kvm, pfn); > + return -EIO; > + } else if (measure) { > + for (i = 0; i < PAGE_SIZE; i += TDX_EXTENDMR_CHUNKSIZE) { > + err = tdh_mr_extend(kvm_tdx->tdr_pa, gpa + i, &out); > + if (KVM_BUG_ON(err, &kvm_tdx->kvm)) { > + pr_tdx_error(TDH_MR_EXTEND, err, &out); > + break; > + } > + } Why is measurement done deep within the MMU? At a glance, I don't see why this can't be done up in the ioctl, outside of a spinlock. And IIRC, the order affects the measurement but doesn't truly matter, e.g. KVM could choose to completely separate tdh_mr_extend() from tdh_mem_page_add(), no? > +static int tdx_init_mem_region(struct kvm *kvm, struct kvm_tdx_cmd *cmd) > +{ > + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); > + struct kvm_tdx_init_mem_region region; > + struct kvm_vcpu *vcpu; > + struct page *page; > + int idx, ret = 0; > + bool added = false; > + > + /* Once TD is finalized, the initial guest memory is fixed. */ > + if (is_td_finalized(kvm_tdx)) > + return -EINVAL; > + > + /* The BSP vCPU must be created before initializing memory regions. 
*/ > + if (!atomic_read(&kvm->online_vcpus)) > + return -EINVAL; > + > + if (cmd->flags & ~KVM_TDX_MEASURE_MEMORY_REGION) > + return -EINVAL; > + > + if (copy_from_user(&region, (void __user *)cmd->data, sizeof(region))) > + return -EFAULT; > + > + /* Sanity check */ > + if (!IS_ALIGNED(region.source_addr, PAGE_SIZE) || > + !IS_ALIGNED(region.gpa, PAGE_SIZE) || > + !region.nr_pages || > + region.nr_pages & GENMASK_ULL(63, 63 - PAGE_SHIFT) || > + region.gpa + (region.nr_pages << PAGE_SHIFT) <= region.gpa || > + !kvm_is_private_gpa(kvm, region.gpa) || > + !kvm_is_private_gpa(kvm, region.gpa + (region.nr_pages << PAGE_SHIFT))) > + return -EINVAL; > + > + vcpu = kvm_get_vcpu(kvm, 0); > + if (mutex_lock_killable(&vcpu->mutex)) > + return -EINTR; The real reason for this drive-by pseudo-review is that I am hoping/wishing we can turn this into a generic KVM ioctl() to allow userspace to pre-map guest memory[*]. If we're going to carry non-trivial code, we might as well squeeze as much use out of it as we can. Beyond wanting to shove this into KVM_MEMORY_ENCRYPT_OP, is there any reason why this is a VM ioctl() and not a vCPU ioctl()? Very roughly, couldn't we use a struct like this as input to a vCPU ioctl() that maps memory, and optionally initializes memory from @source? struct kvm_memory_mapping { __u64 base_gfn; __u64 nr_pages; __u64 flags; __u64 source; } TDX would need to do special things for copying the source, but beyond that most of the code in this function is generic. [*] https://lore.kernel.org/all/65262e67-7885-971a-896d-ad9c0a760907@xxxxxxxxx