Chao Peng <chao.p.peng@xxxxxxxxxxxxxxx> writes: > In memory encryption usage, guest memory may be encrypted with special > key and can be accessed only by the guest itself. We call such memory > private memory. It's valueless and sometimes can cause problem to allow > userspace to access guest private memory. This new KVM memslot extension > allows guest private memory being provided though a restrictedmem > backed file descriptor(fd) and userspace is restricted to access the > bookmarked memory in the fd. > <snip> > To make code maintenance easy, internally we use a binary compatible > alias struct kvm_user_mem_region to handle both the normal and the > '_ext' variants. > diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h > index 0d5d4419139a..f1ae45c10c94 100644 > --- a/include/uapi/linux/kvm.h > +++ b/include/uapi/linux/kvm.h > @@ -103,6 +103,33 @@ struct kvm_userspace_memory_region { > __u64 userspace_addr; /* start of the userspace allocated memory */ > }; > > +struct kvm_userspace_memory_region_ext { > + struct kvm_userspace_memory_region region; > + __u64 restricted_offset; > + __u32 restricted_fd; > + __u32 pad1; > + __u64 pad2[14]; > +}; > + > +#ifdef __KERNEL__ > +/* > + * kvm_user_mem_region is a kernel-only alias of kvm_userspace_memory_region_ext > + * that "unpacks" kvm_userspace_memory_region so that KVM can directly access > + * all fields from the top-level "extended" region. > + */ > +struct kvm_user_mem_region { > + __u32 slot; > + __u32 flags; > + __u64 guest_phys_addr; > + __u64 memory_size; > + __u64 userspace_addr; > + __u64 restricted_offset; > + __u32 restricted_fd; > + __u32 pad1; > + __u64 pad2[14]; > +}; > +#endif I'm not sure I buy the argument this makes the code maintenance easier because you now have multiple places to update if you extend the field. Was this simply to avoid changing: foo->slot to foo->region.slot in the underlying code? > + > /* > * The bit 0 ~ bit 15 of kvm_memory_region::flags are visible for userspace, > * other bits are reserved for kvm internal use which are defined in > @@ -110,6 +137,7 @@ struct kvm_userspace_memory_region { > */ > #define KVM_MEM_LOG_DIRTY_PAGES (1UL << 0) > #define KVM_MEM_READONLY (1UL << 1) > +#define KVM_MEM_PRIVATE (1UL << 2) > > /* for KVM_IRQ_LINE */ > struct kvm_irq_level { > @@ -1178,6 +1206,7 @@ struct kvm_ppc_resize_hpt { > #define KVM_CAP_S390_ZPCI_OP 221 > #define KVM_CAP_S390_CPU_TOPOLOGY 222 > #define KVM_CAP_DIRTY_LOG_RING_ACQ_REL 223 > +#define KVM_CAP_PRIVATE_MEM 224 > > #ifdef KVM_CAP_IRQ_ROUTING > > diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig > index 800f9470e36b..9ff164c7e0cc 100644 > --- a/virt/kvm/Kconfig > +++ b/virt/kvm/Kconfig > @@ -86,3 +86,6 @@ config KVM_XFER_TO_GUEST_WORK > > config HAVE_KVM_PM_NOTIFIER > bool > + > +config HAVE_KVM_RESTRICTED_MEM > + bool > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c > index e30f1b4ecfa5..8dace78a0278 100644 > --- a/virt/kvm/kvm_main.c > +++ b/virt/kvm/kvm_main.c > @@ -1526,7 +1526,7 @@ static void kvm_replace_memslot(struct kvm *kvm, > } > } > > -static int check_memory_region_flags(const struct kvm_userspace_memory_region *mem) > +static int check_memory_region_flags(const struct kvm_user_mem_region *mem) > { > u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES; > > @@ -1920,7 +1920,7 @@ static bool kvm_check_memslot_overlap(struct kvm_memslots *slots, int id, > * Must be called holding kvm->slots_lock for write. > */ > int __kvm_set_memory_region(struct kvm *kvm, > - const struct kvm_userspace_memory_region *mem) > + const struct kvm_user_mem_region *mem) > { > struct kvm_memory_slot *old, *new; > struct kvm_memslots *slots; > @@ -2024,7 +2024,7 @@ int __kvm_set_memory_region(struct kvm *kvm, > EXPORT_SYMBOL_GPL(__kvm_set_memory_region); > > int kvm_set_memory_region(struct kvm *kvm, > - const struct kvm_userspace_memory_region *mem) > + const struct kvm_user_mem_region *mem) > { > int r; > > @@ -2036,7 +2036,7 @@ int kvm_set_memory_region(struct kvm *kvm, > EXPORT_SYMBOL_GPL(kvm_set_memory_region); > > static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, > - struct kvm_userspace_memory_region *mem) > + struct kvm_user_mem_region *mem) > { > if ((u16)mem->slot >= KVM_USER_MEM_SLOTS) > return -EINVAL; > @@ -4627,6 +4627,33 @@ static int kvm_vm_ioctl_get_stats_fd(struct kvm *kvm) > return fd; > } > > +#define SANITY_CHECK_MEM_REGION_FIELD(field) \ > +do { \ > + BUILD_BUG_ON(offsetof(struct kvm_user_mem_region, field) != \ > + offsetof(struct kvm_userspace_memory_region, field)); \ > + BUILD_BUG_ON(sizeof_field(struct kvm_user_mem_region, field) != \ > + sizeof_field(struct kvm_userspace_memory_region, field)); \ > +} while (0) > + > +#define SANITY_CHECK_MEM_REGION_EXT_FIELD(field) \ > +do { \ > + BUILD_BUG_ON(offsetof(struct kvm_user_mem_region, field) != \ > + offsetof(struct kvm_userspace_memory_region_ext, field)); \ > + BUILD_BUG_ON(sizeof_field(struct kvm_user_mem_region, field) != \ > + sizeof_field(struct kvm_userspace_memory_region_ext, field)); \ > +} while (0) > + > +static void kvm_sanity_check_user_mem_region_alias(void) > +{ > + SANITY_CHECK_MEM_REGION_FIELD(slot); > + SANITY_CHECK_MEM_REGION_FIELD(flags); > + SANITY_CHECK_MEM_REGION_FIELD(guest_phys_addr); > + SANITY_CHECK_MEM_REGION_FIELD(memory_size); > + SANITY_CHECK_MEM_REGION_FIELD(userspace_addr); > + SANITY_CHECK_MEM_REGION_EXT_FIELD(restricted_offset); > + SANITY_CHECK_MEM_REGION_EXT_FIELD(restricted_fd); > +} Do we have other examples in the kernel that jump these hoops? > static long kvm_vm_ioctl(struct file *filp, > unsigned int ioctl, unsigned long arg) > { > @@ -4650,14 +4677,20 @@ static long kvm_vm_ioctl(struct file *filp, > break; > } > case KVM_SET_USER_MEMORY_REGION: { > - struct kvm_userspace_memory_region kvm_userspace_mem; > + struct kvm_user_mem_region mem; > + unsigned long size = sizeof(struct kvm_userspace_memory_region); > + > + kvm_sanity_check_user_mem_region_alias(); > > r = -EFAULT; > - if (copy_from_user(&kvm_userspace_mem, argp, > - sizeof(kvm_userspace_mem))) > + if (copy_from_user(&mem, argp, size)) > + goto out; > + > + r = -EINVAL; > + if (mem.flags & KVM_MEM_PRIVATE) > goto out; Hmm I can see in the later code you explicitly check for the KVM_MEM_PRIVATE flag with: if (get_user(flags, (u32 __user *)(argp + flags_offset))) goto out; if (flags & KVM_MEM_PRIVATE) size = sizeof(struct kvm_userspace_memory_region_ext); else size = sizeof(struct kvm_userspace_memory_region); I think it would make sense to bring that sanity checking forward into this patch to avoid the validation logic working in two different ways over the series. > > - r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem); > + r = kvm_vm_ioctl_set_memory_region(kvm, &mem); > break; > } > case KVM_GET_DIRTY_LOG: { -- Alex Bennée