On Fri, 10 Mar 2023 15:47:16 +0000
Steven Price <steven.price@xxxxxxx> wrote:

> On 06/03/2023 17:34, Zhi Wang wrote:
> > On Fri, 27 Jan 2023 11:29:20 +0000
> > Steven Price <steven.price@xxxxxxx> wrote:
> > 
> >> The VMM needs to populate the realm with some data before starting (e.g.
> >> a kernel and initrd). This is measured by the RMM and used as part of
> >> the attestation later on.
> >> 
> >> Signed-off-by: Steven Price <steven.price@xxxxxxx>
> >> ---
> >>  arch/arm64/kvm/rme.c | 366 +++++++++++++++++++++++++++++++++++++++++++
> >>  1 file changed, 366 insertions(+)
> >> 
> >> diff --git a/arch/arm64/kvm/rme.c b/arch/arm64/kvm/rme.c
> >> index 16e0bfea98b1..3405b43e1421 100644
> >> --- a/arch/arm64/kvm/rme.c
> >> +++ b/arch/arm64/kvm/rme.c
> >> @@ -4,6 +4,7 @@
> >>   */
> >>  
> >>  #include <linux/kvm_host.h>
> >> +#include <linux/hugetlb.h>
> >>  
> >>  #include <asm/kvm_emulate.h>
> >>  #include <asm/kvm_mmu.h>
> >> @@ -426,6 +427,359 @@ void kvm_realm_unmap_range(struct kvm *kvm, unsigned long ipa, u64 size)
> >>          }
> >>  }
> >>  
> >> +static int realm_create_protected_data_page(struct realm *realm,
> >> +                                            unsigned long ipa,
> >> +                                            struct page *dst_page,
> >> +                                            struct page *tmp_page)
> >> +{
> >> +        phys_addr_t dst_phys, tmp_phys;
> >> +        int ret;
> >> +
> >> +        copy_page(page_address(tmp_page), page_address(dst_page));
> >> +
> >> +        dst_phys = page_to_phys(dst_page);
> >> +        tmp_phys = page_to_phys(tmp_page);
> >> +
> >> +        if (rmi_granule_delegate(dst_phys))
> >> +                return -ENXIO;
> >> +
> >> +        ret = rmi_data_create(dst_phys, virt_to_phys(realm->rd), ipa, tmp_phys,
> >> +                              RMI_MEASURE_CONTENT);
> >> +
> >> +        if (RMI_RETURN_STATUS(ret) == RMI_ERROR_RTT) {
> >> +                /* Create missing RTTs and retry */
> >> +                int level = RMI_RETURN_INDEX(ret);
> >> +
> >> +                ret = realm_create_rtt_levels(realm, ipa, level,
> >> +                                              RME_RTT_MAX_LEVEL, NULL);
> >> +                if (ret)
> >> +                        goto err;
> >> +
> >> +                ret = rmi_data_create(dst_phys, virt_to_phys(realm->rd), ipa,
> >> +                                      tmp_phys, RMI_MEASURE_CONTENT);
> >> +        }
> >> +
> >> +        if (ret)
> >> +                goto err;
> >> +
> >> +        return 0;
> >> +
> >> +err:
> >> +        if (WARN_ON(rmi_granule_undelegate(dst_phys))) {
> >> +                /* Page can't be returned to NS world so is lost */
> >> +                get_page(dst_page);
> >> +        }
> >> +        return -ENXIO;
> >> +}
> >> +
> >> +static int fold_rtt(phys_addr_t rd, unsigned long addr, int level,
> >> +                    struct realm *realm)
> >> +{
> >> +        struct rtt_entry rtt;
> >> +        phys_addr_t rtt_addr;
> >> +
> >> +        if (rmi_rtt_read_entry(rd, addr, level, &rtt))
> >> +                return -ENXIO;
> >> +
> >> +        if (rtt.state != RMI_TABLE)
> >> +                return -EINVAL;
> >> +
> >> +        rtt_addr = rmi_rtt_get_phys(&rtt);
> >> +        if (rmi_rtt_fold(rtt_addr, rd, addr, level + 1))
> >> +                return -ENXIO;
> >> +
> >> +        free_delegated_page(realm, rtt_addr);
> >> +
> >> +        return 0;
> >> +}
> >> +
> >> +int realm_map_protected(struct realm *realm,
> >> +                        unsigned long hva,
> >> +                        unsigned long base_ipa,
> >> +                        struct page *dst_page,
> >> +                        unsigned long map_size,
> >> +                        struct kvm_mmu_memory_cache *memcache)
> >> +{
> >> +        phys_addr_t dst_phys = page_to_phys(dst_page);
> >> +        phys_addr_t rd = virt_to_phys(realm->rd);
> >> +        unsigned long phys = dst_phys;
> >> +        unsigned long ipa = base_ipa;
> >> +        unsigned long size;
> >> +        int map_level;
> >> +        int ret = 0;
> >> +
> >> +        if (WARN_ON(!IS_ALIGNED(ipa, map_size)))
> >> +                return -EINVAL;
> >> +
> >> +        switch (map_size) {
> >> +        case PAGE_SIZE:
> >> +                map_level = 3;
> >> +                break;
> >> +        case RME_L2_BLOCK_SIZE:
> >> +                map_level = 2;
> >> +                break;
> >> +        default:
> >> +                return -EINVAL;
> >> +        }
> >> +
> >> +        if (map_level < RME_RTT_MAX_LEVEL) {
> >> +                /*
> >> +                 * A temporary RTT is needed during the map, precreate it,
> >> +                 * however if there is an error (e.g. missing parent tables)
> >> +                 * this will be handled below.
> >> +                 */
> >> +                realm_create_rtt_levels(realm, ipa, map_level,
> >> +                                        RME_RTT_MAX_LEVEL, memcache);
> >> +        }
> >> +
> >> +        for (size = 0; size < map_size; size += PAGE_SIZE) {
> >> +                if (rmi_granule_delegate(phys)) {
> >> +                        struct rtt_entry rtt;
> >> +
> >> +                        /*
> >> +                         * It's possible we raced with another VCPU on the same
> >> +                         * fault. If the entry exists and matches then exit
> >> +                         * early and assume the other VCPU will handle the
> >> +                         * mapping.
> >> +                         */
> >> +                        if (rmi_rtt_read_entry(rd, ipa, RME_RTT_MAX_LEVEL, &rtt))
> >> +                                goto err;
> >> +
> >> +                        // FIXME: For a block mapping this could race at level
> >> +                        // 2 or 3...
> >> +                        if (WARN_ON((rtt.walk_level != RME_RTT_MAX_LEVEL ||
> >> +                                     rtt.state != RMI_ASSIGNED ||
> >> +                                     rtt.desc != phys))) {
> >> +                                goto err;
> >> +                        }
> >> +
> >> +                        return 0;
> >> +                }
> >> +
> >> +                ret = rmi_data_create_unknown(phys, rd, ipa);
> >> +
> >> +                if (RMI_RETURN_STATUS(ret) == RMI_ERROR_RTT) {
> >> +                        /* Create missing RTTs and retry */
> >> +                        int level = RMI_RETURN_INDEX(ret);
> >> +
> >> +                        ret = realm_create_rtt_levels(realm, ipa, level,
> >> +                                                      RME_RTT_MAX_LEVEL,
> >> +                                                      memcache);
> >> +                        WARN_ON(ret);
> >> +                        if (ret)
> >> +                                goto err_undelegate;
> >> +
> >> +                        ret = rmi_data_create_unknown(phys, rd, ipa);
> >> +                }
> >> +                WARN_ON(ret);
> >> +
> >> +                if (ret)
> >> +                        goto err_undelegate;
> >> +
> >> +                phys += PAGE_SIZE;
> >> +                ipa += PAGE_SIZE;
> >> +        }
> >> +
> >> +        if (map_size == RME_L2_BLOCK_SIZE)
> >> +                ret = fold_rtt(rd, base_ipa, map_level, realm);
> >> +        if (WARN_ON(ret))
> >> +                goto err;
> >> +
> >> +        return 0;
> >> +
> >> +err_undelegate:
> >> +        if (WARN_ON(rmi_granule_undelegate(phys))) {
> >> +                /* Page can't be returned to NS world so is lost */
> >> +                get_page(phys_to_page(phys));
> >> +        }
> >> +err:
> >> +        while (size > 0) {
> >> +                phys -= PAGE_SIZE;
> >> +                size -= PAGE_SIZE;
> >> +                ipa -= PAGE_SIZE;
> >> +
> >> +                rmi_data_destroy(rd, ipa);
> >> +
> >> +                if (WARN_ON(rmi_granule_undelegate(phys))) {
> >> +                        /* Page can't be returned to NS world so is lost */
> >> +                        get_page(phys_to_page(phys));
> >> +                }
> >> +        }
> >> +        return -ENXIO;
> >> +}
> >> +
> > 
> > There seems no caller to the function above. Better move it to the related
> > patch.
> 
> Indeed this should really be in the next patch - will move as it's very
> confusing having it in this patch (sorry about that).
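
Sounds good. When it moves, it might also help to show the expected call
site in that patch's commit message. Just so we are talking about the same
thing, below is roughly the call shape I have in mind from the stage-2 fault
path (purely a hypothetical sketch - realm_map_fault() and the variables
around it are my invention; only realm_map_protected() and RME_L2_BLOCK_SIZE
come from your series):

        /* Hypothetical caller in the stage-2 fault handling patch */
        static int realm_map_fault(struct kvm_vcpu *vcpu, phys_addr_t ipa,
                                   unsigned long hva, struct page *page,
                                   struct kvm_mmu_memory_cache *memcache)
        {
                struct realm *realm = &vcpu->kvm->arch.realm;
                unsigned long map_size = PAGE_SIZE;

                /* Use a level-2 block mapping when IPA and HVA both allow it */
                if (IS_ALIGNED(ipa, RME_L2_BLOCK_SIZE) &&
                    IS_ALIGNED(hva, RME_L2_BLOCK_SIZE))
                        map_size = RME_L2_BLOCK_SIZE;

                return realm_map_protected(realm, hva, ALIGN_DOWN(ipa, map_size),
                                           page, map_size, memcache);
        }
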
> 
> >> +static int populate_par_region(struct kvm *kvm,
> >> +                               phys_addr_t ipa_base,
> >> +                               phys_addr_t ipa_end)
> >> +{
> >> +        struct realm *realm = &kvm->arch.realm;
> >> +        struct kvm_memory_slot *memslot;
> >> +        gfn_t base_gfn, end_gfn;
> >> +        int idx;
> >> +        phys_addr_t ipa;
> >> +        int ret = 0;
> >> +        struct page *tmp_page;
> >> +        phys_addr_t rd = virt_to_phys(realm->rd);
> >> +
> >> +        base_gfn = gpa_to_gfn(ipa_base);
> >> +        end_gfn = gpa_to_gfn(ipa_end);
> >> +
> >> +        idx = srcu_read_lock(&kvm->srcu);
> >> +        memslot = gfn_to_memslot(kvm, base_gfn);
> >> +        if (!memslot) {
> >> +                ret = -EFAULT;
> >> +                goto out;
> >> +        }
> >> +
> >> +        /* We require the region to be contained within a single memslot */
> >> +        if (memslot->base_gfn + memslot->npages < end_gfn) {
> >> +                ret = -EINVAL;
> >> +                goto out;
> >> +        }
> >> +
> >> +        tmp_page = alloc_page(GFP_KERNEL);
> >> +        if (!tmp_page) {
> >> +                ret = -ENOMEM;
> >> +                goto out;
> >> +        }
> >> +
> >> +        mmap_read_lock(current->mm);
> >> +
> >> +        ipa = ipa_base;
> >> +
> >> +        while (ipa < ipa_end) {
> >> +                struct vm_area_struct *vma;
> >> +                unsigned long map_size;
> >> +                unsigned int vma_shift;
> >> +                unsigned long offset;
> >> +                unsigned long hva;
> >> +                struct page *page;
> >> +                kvm_pfn_t pfn;
> >> +                int level;
> >> +
> >> +                hva = gfn_to_hva_memslot(memslot, gpa_to_gfn(ipa));
> >> +                vma = vma_lookup(current->mm, hva);
> >> +                if (!vma) {
> >> +                        ret = -EFAULT;
> >> +                        break;
> >> +                }
> >> +
> >> +                if (is_vm_hugetlb_page(vma))
> >> +                        vma_shift = huge_page_shift(hstate_vma(vma));
> >> +                else
> >> +                        vma_shift = PAGE_SHIFT;
> >> +
> >> +                map_size = 1 << vma_shift;
> >> +
> >> +                /*
> >> +                 * FIXME: This causes over mapping, but there's no good
> >> +                 * solution here with the ABI as it stands
> >> +                 */
> >> +                ipa = ALIGN_DOWN(ipa, map_size);
> >> +
> >> +                switch (map_size) {
> >> +                case RME_L2_BLOCK_SIZE:
> >> +                        level = 2;
> >> +                        break;
> >> +                case PAGE_SIZE:
> >> +                        level = 3;
> >> +                        break;
> >> +                default:
> >> +                        WARN_ONCE(1, "Unsupport vma_shift %d", vma_shift);
> >> +                        ret = -EFAULT;
> >> +                        break;
> >> +                }
> >> +
> >> +                pfn = gfn_to_pfn_memslot(memslot, gpa_to_gfn(ipa));
> >> +
> >> +                if (is_error_pfn(pfn)) {
> >> +                        ret = -EFAULT;
> >> +                        break;
> >> +                }
> >> +
> >> +                ret = rmi_rtt_init_ripas(rd, ipa, level);
> >> +                if (RMI_RETURN_STATUS(ret) == RMI_ERROR_RTT) {
> >> +                        ret = realm_create_rtt_levels(realm, ipa,
> >> +                                                      RMI_RETURN_INDEX(ret),
> >> +                                                      level, NULL);
> >> +                        if (ret)
> >> +                                break;
> >> +                        ret = rmi_rtt_init_ripas(rd, ipa, level);
> >> +                        if (ret) {
> >> +                                ret = -ENXIO;
> >> +                                break;
> >> +                        }
> >> +                }
> >> +
> >> +                if (level < RME_RTT_MAX_LEVEL) {
> >> +                        /*
> >> +                         * A temporary RTT is needed during the map, precreate
> >> +                         * it, however if there is an error (e.g. missing
> >> +                         * parent tables) this will be handled in the
> >> +                         * realm_create_protected_data_page() call.
> >> + */ > >> + realm_create_rtt_levels(realm, ipa, level, > >> + RME_RTT_MAX_LEVEL, NULL); > >> + } > >> + > >> + page = pfn_to_page(pfn); > >> + > >> + for (offset = 0; offset < map_size && !ret; > >> + offset += PAGE_SIZE, page++) { > >> + phys_addr_t page_ipa = ipa + offset; > >> + > >> + ret = realm_create_protected_data_page(realm, page_ipa, > >> + page, tmp_page); > >> + } > >> + if (ret) > >> + goto err_release_pfn; > >> + > >> + if (level == 2) { > >> + ret = fold_rtt(rd, ipa, level, realm); > >> + if (ret) > >> + goto err_release_pfn; > >> + } > >> + > >> + ipa += map_size; > > > >> + kvm_set_pfn_accessed(pfn); > >> + kvm_set_pfn_dirty(pfn); > > > > kvm_release_pfn_dirty() has already called kvm_set_pfn_{accessed, dirty}(). > > Will remove those calls. > > >> + kvm_release_pfn_dirty(pfn); > >> +err_release_pfn: > >> + if (ret) { > >> + kvm_release_pfn_clean(pfn); > >> + break; > >> + } > >> + } > >> + > >> + mmap_read_unlock(current->mm); > >> + __free_page(tmp_page); > >> + > >> +out: > >> + srcu_read_unlock(&kvm->srcu, idx); > >> + return ret; > >> +} > >> + > >> +static int kvm_populate_realm(struct kvm *kvm, > >> + struct kvm_cap_arm_rme_populate_realm_args *args) > >> +{ > >> + phys_addr_t ipa_base, ipa_end; > >> + > > > > Check kvm_is_realm(kvm) here or in the kvm_realm_enable_cap(). > > I'm going to update kvm_vm_ioctl_enable_cap() to check kvm_is_realm() so > we won't get here. > > >> + if (kvm_realm_state(kvm) != REALM_STATE_NEW) > >> + return -EBUSY; > > > > Maybe -EINVAL? The realm hasn't been created (RMI_REALM_CREATE is not called > > yet). The userspace shouldn't reach this path. > > Well user space can attempt to populate in the ACTIVE state - which is > where the idea of 'busy' comes from. Admittedly it's a little confusing > when RMI_REALM_CREATE hasn't been called. > > I'm not particularly bothered about the return code, but it's useful to > have a different code to -EINVAL as it's not an invalid argument, but > calling at the wrong time. I can't immediately see a better error code > though. > The reason why I feel -EBUSY is little bit off is EBUSY usually indicates something is already initialized and currently running, then another calling path wanna to operate it. I took a look on the ioctls in arch/arm64/kvm/arm.c. It seems people have different opinions for calling execution path at a wrong time: For example: long kvm_arch_vcpu_ioctl() ... case KVM_GET_REG_LIST: { struct kvm_reg_list __user *user_list = argp; struct kvm_reg_list reg_list; unsigned n; r = -ENOEXEC; if (unlikely(!kvm_vcpu_initialized(vcpu))) break; r = -EPERM; if (!kvm_arm_vcpu_is_finalized(vcpu)) break; If we have to choose one, I prefer -ENOEXEC as -EPERM is stranger. But personally my vote goes to -EINVAL. 
> Steve
> 
> >> +
> >> +        if (!IS_ALIGNED(args->populate_ipa_base, PAGE_SIZE) ||
> >> +            !IS_ALIGNED(args->populate_ipa_size, PAGE_SIZE))
> >> +                return -EINVAL;
> >> +
> >> +        ipa_base = args->populate_ipa_base;
> >> +        ipa_end = ipa_base + args->populate_ipa_size;
> >> +
> >> +        if (ipa_end < ipa_base)
> >> +                return -EINVAL;
> >> +
> >> +        return populate_par_region(kvm, ipa_base, ipa_end);
> >> +}
> >> +
> >>  static int set_ipa_state(struct kvm_vcpu *vcpu,
> >>                           unsigned long ipa,
> >>                           unsigned long end,
> >> @@ -748,6 +1102,18 @@ int kvm_realm_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
> >>                  r = kvm_init_ipa_range_realm(kvm, &args);
> >>                  break;
> >>          }
> >> +        case KVM_CAP_ARM_RME_POPULATE_REALM: {
> >> +                struct kvm_cap_arm_rme_populate_realm_args args;
> >> +                void __user *argp = u64_to_user_ptr(cap->args[1]);
> >> +
> >> +                if (copy_from_user(&args, argp, sizeof(args))) {
> >> +                        r = -EFAULT;
> >> +                        break;
> >> +                }
> >> +
> >> +                r = kvm_populate_realm(kvm, &args);
> >> +                break;
> >> +        }
> >>          default:
> >>                  r = -EINVAL;
> >>                  break;
> 
> > 
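
One more minor comment: it might be worth spelling out the expected VMM flow
in the commit message or the cap documentation. My understanding of the
intended usage is roughly the below (a sketch only - I am assuming the outer
capability number is the series' KVM_CAP_ARM_RME and that args[0] carries the
sub-command, which is how the quoted kvm_realm_enable_cap() switch reads;
realm_populate() itself is just an illustrative name):

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/*
 * VMM side: called after the kernel/initrd have been copied into the
 * memslot backing [ipa_base, ipa_base + size), and before activation.
 */
static int realm_populate(int vm_fd, uint64_t ipa_base, uint64_t size)
{
        struct kvm_cap_arm_rme_populate_realm_args args = {
                .populate_ipa_base = ipa_base,
                .populate_ipa_size = size,
        };
        struct kvm_enable_cap cap = {
                .cap = KVM_CAP_ARM_RME,         /* assumed outer cap */
                .args[0] = KVM_CAP_ARM_RME_POPULATE_REALM,
                .args[1] = (uint64_t)(uintptr_t)&args,
        };

        return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
}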