On 2/8/2021 11:47 AM, Michael Kelley wrote: > From: Nuno Das Neves <nunodasneves@xxxxxxxxxxxxxxxxxxx> Sent: Friday, November 20, 2020 4:30 PM >> >> Add ioctls for getting and setting virtual processor registers. >> >> Co-developed-by: Lillian Grassin-Drake <ligrassi@xxxxxxxxxxxxx> >> Signed-off-by: Lillian Grassin-Drake <ligrassi@xxxxxxxxxxxxx> >> Signed-off-by: Nuno Das Neves <nunodasneves@xxxxxxxxxxxxxxxxxxx> >> --- >> Documentation/virt/mshv/api.rst | 11 + >> arch/x86/include/uapi/asm/hyperv-tlfs.h | 601 ++++++++++++++++++++++++ >> include/asm-generic/hyperv-tlfs.h | 65 +-- >> include/linux/mshv.h | 1 + >> include/uapi/linux/mshv.h | 12 + >> virt/mshv/mshv_main.c | 258 +++++++++- >> 6 files changed, 903 insertions(+), 45 deletions(-) >> [snip] >> + >> +union hv_register_value { >> + struct hv_u128 reg128; >> + __u64 reg64; >> + __u32 reg32; >> + __u16 reg16; >> + __u8 reg8; >> + union hv_x64_fp_register fp; >> + union hv_x64_fp_control_status_register fp_control_status; >> + union hv_x64_xmm_control_status_register xmm_control_status; >> + struct hv_x64_segment_register segment; >> + struct hv_x64_table_register table; >> + union hv_explicit_suspend_register explicit_suspend; >> + union hv_intercept_suspend_register intercept_suspend; >> + union hv_dispatch_suspend_register dispatch_suspend; >> + union hv_x64_interrupt_state_register interrupt_state; >> + union hv_x64_pending_interruption_register pending_interruption; >> + union hv_x64_msr_npiep_config_contents npiep_config; >> + union hv_x64_pending_exception_event pending_exception_event; >> + union hv_x64_pending_virtualization_fault_event >> + pending_virtualization_fault_event; >> +}; >> + >> #endif >> diff --git a/include/asm-generic/hyperv-tlfs.h b/include/asm-generic/hyperv-tlfs.h >> index 6e5072e29897..b9295400c20b 100644 >> --- a/include/asm-generic/hyperv-tlfs.h >> +++ b/include/asm-generic/hyperv-tlfs.h >> @@ -622,53 +622,30 @@ struct hv_retarget_device_interrupt { >> } __packed __aligned(8); >> >> >> -/* HvGetVpRegisters hypercall input with variable size reg name list*/ >> -struct hv_get_vp_registers_input { >> - struct { >> - u64 partitionid; >> - u32 vpindex; >> - u8 inputvtl; >> - u8 padding[3]; >> - } header; >> - struct input { >> - u32 name0; >> - u32 name1; >> - } element[]; >> -} __packed; >> - >> +/* HvGetVpRegisters hypercall with variable size reg name list*/ >> +struct hv_get_vp_registers { >> + u64 partition_id; >> + u32 vp_index; >> + u8 input_vtl; >> + u8 rsvd_z8; >> + u16 rsvd_z16; >> + __aligned(8) enum hv_register_name names[]; >> +} __aligned(8); >> >> -/* HvGetVpRegisters returns an array of these output elements */ >> -struct hv_get_vp_registers_output { >> - union { >> - struct { >> - u32 a; >> - u32 b; >> - u32 c; >> - u32 d; >> - } as32 __packed; >> - struct { >> - u64 low; >> - u64 high; >> - } as64 __packed; >> - }; >> +/* HvSetVpRegisters hypercall with variable size reg name/value list*/ >> +struct hv_register_assoc { >> + enum hv_register_name name; >> + __aligned(16) union hv_register_value value; >> }; >> >> -/* HvSetVpRegisters hypercall with variable size reg name/value list*/ >> -struct hv_set_vp_registers_input { >> - struct { >> - u64 partitionid; >> - u32 vpindex; >> - u8 inputvtl; >> - u8 padding[3]; >> - } header; >> - struct { >> - u32 name; >> - u32 padding1; >> - u64 padding2; >> - u64 valuelow; >> - u64 valuehigh; >> - } element[]; >> -} __packed; >> +struct hv_set_vp_registers { >> + u64 partition_id; >> + u32 vp_index; >> + u8 input_vtl; >> + u8 rsvd_z8; >> + u16 rsvd_z16; >> + struct hv_register_assoc elements[]; >> +} __aligned(16); > > Throughout these structures, I think the approach needs to be more > explicit about the memory layout. The current definitions assume that > the compiler is inserting padding in the expected places, and not in > any unexpected places. My previous concerns about use of enum > also apply. > > The code also removes some layouts that are used in the > not-yet-accepted patches for ARM64. Let sync on how to get > those back in. > I'll add __packed to all these structures. The hv_register_name enum can be replaced by #defines, and the type can be u32 or u64 (it only needs 32 bits). I'll sync with you on the ARM64 structs. >> >> enum hv_device_type { >> HV_DEVICE_TYPE_LOGICAL = 0, >> diff --git a/include/linux/mshv.h b/include/linux/mshv.h >> index 50521c5f7948..dfe469f573f9 100644 >> --- a/include/linux/mshv.h >> +++ b/include/linux/mshv.h >> @@ -17,6 +17,7 @@ >> struct mshv_vp { >> u32 index; >> struct mshv_partition *partition; >> + struct mutex mutex; >> }; >> >> struct mshv_mem_region { >> diff --git a/include/uapi/linux/mshv.h b/include/uapi/linux/mshv.h >> index 1f053eae68a6..5d53ed655429 100644 >> --- a/include/uapi/linux/mshv.h >> +++ b/include/uapi/linux/mshv.h >> @@ -33,6 +33,14 @@ struct mshv_create_vp { >> __u32 vp_index; >> }; >> >> +#define MSHV_VP_MAX_REGISTERS 128 >> + >> +struct mshv_vp_registers { >> + int count; /* at most MSHV_VP_MAX_REGISTERS */ >> + enum hv_register_name *names; >> + union hv_register_value *values; >> +}; > > Having separate arrays for the names and values results in an extra > copy of the data down in the ioctl code. Any reason the caller couldn't > supply the data as an array, where each entry is already a name/value > pair? > I initially thought it would not make a difference to the number of copies, but it turns out it does. I will change it to use hv_register_assoc everywhere. >> + >> #define MSHV_IOCTL 0xB8 >> >> /* mshv device */ >> @@ -44,4 +52,8 @@ struct mshv_create_vp { >> #define MSHV_UNMAP_GUEST_MEMORY _IOW(MSHV_IOCTL, 0x03, struct >> mshv_user_mem_region) >> #define MSHV_CREATE_VP _IOW(MSHV_IOCTL, 0x04, struct mshv_create_vp) >> >> +/* vp device */ >> +#define MSHV_GET_VP_REGISTERS _IOWR(MSHV_IOCTL, 0x05, struct >> mshv_vp_registers) >> +#define MSHV_SET_VP_REGISTERS _IOW(MSHV_IOCTL, 0x06, struct mshv_vp_registers) >> + >> #endif >> diff --git a/virt/mshv/mshv_main.c b/virt/mshv/mshv_main.c >> index 3be9d9a468c1..2a10137a1e84 100644 >> --- a/virt/mshv/mshv_main.c >> +++ b/virt/mshv/mshv_main.c >> @@ -74,6 +74,12 @@ static struct miscdevice mshv_dev = { >> #define HV_MAP_GPA_BATCH_SIZE \ >> (PAGE_SIZE / sizeof(struct hv_map_gpa_pages) / sizeof(u64)) >> #define PIN_PAGES_BATCH_SIZE (0x10000000 / PAGE_SIZE) >> +#define HV_GET_REGISTER_BATCH_SIZE \ >> + (PAGE_SIZE / \ >> + sizeof(struct hv_get_vp_registers) / sizeof(enum hv_register_name)) >> +#define HV_SET_REGISTER_BATCH_SIZE \ >> + (PAGE_SIZE / \ >> + sizeof(struct hv_set_vp_registers) / sizeof(struct hv_register_assoc)) > > These new size calculations have the same bug as HV_MAP_GPA_BATCH_SIZE. > The first divide operations should be subtraction. > Yep, I'll fix it. > With the correct calculation, HV_GET_REGISTER_BATCH_SIZE will be > too large. The input page will accommodate more 32 bit register names > than the output page will accommodate 128 bit register values. The limit > should be based on the latter, not the former. Or calculate both the > input and output limit and use the minimum. > I didn't think about this previously! Will fix. >> >> static int >> hv_call_withdraw_memory(u64 count, int node, u64 partition_id) >> @@ -380,10 +386,258 @@ hv_call_unmap_gpa_pages(u64 partition_id, >> return ret; >> } >> >> +static int >> +hv_call_get_vp_registers(u32 vp_index, >> + u64 partition_id, >> + u16 count, >> + const enum hv_register_name *names, >> + union hv_register_value *values) >> +{ >> + struct hv_get_vp_registers *input_page; >> + union hv_register_value *output_page; >> + u16 completed = 0; >> + u64 hypercall_status; >> + unsigned long remaining = count; >> + int rep_count; >> + int status; >> + unsigned long flags; >> + >> + local_irq_save(flags); >> + >> + input_page = (struct hv_get_vp_registers *)(*this_cpu_ptr( >> + hyperv_pcpu_input_arg)); >> + output_page = (union hv_register_value *)(*this_cpu_ptr( >> + hyperv_pcpu_output_arg)); >> + >> + input_page->partition_id = partition_id; >> + input_page->vp_index = vp_index; >> + input_page->input_vtl = 0; >> + input_page->rsvd_z8 = 0; >> + input_page->rsvd_z16 = 0; >> + >> + while (remaining) { >> + rep_count = min(remaining, HV_GET_REGISTER_BATCH_SIZE); >> + memcpy(input_page->names, names, >> + sizeof(enum hv_register_name) * rep_count); >> + >> + hypercall_status = >> + hv_do_rep_hypercall(HVCALL_GET_VP_REGISTERS, rep_count, >> + 0, input_page, output_page); >> + status = hypercall_status & HV_HYPERCALL_RESULT_MASK; >> + if (status != HV_STATUS_SUCCESS) { >> + pr_err("%s: completed %li out of %u, %s\n", >> + __func__, >> + count - remaining, count, >> + hv_status_to_string(status)); >> + break; >> + } >> + completed = (hypercall_status & HV_HYPERCALL_REP_COMP_MASK) >> >> + HV_HYPERCALL_REP_COMP_OFFSET; >> + memcpy(values, output_page, >> + sizeof(union hv_register_value) * completed); >> + >> + names += completed; >> + values += completed; >> + remaining -= completed; >> + } >> + local_irq_restore(flags); >> + >> + return -hv_status_to_errno(status); >> +} >> + >> +static int >> +hv_call_set_vp_registers(u32 vp_index, >> + u64 partition_id, >> + u16 count, >> + struct hv_register_assoc *registers) >> +{ >> + struct hv_set_vp_registers *input_page; >> + u16 completed = 0; >> + u64 hypercall_status; >> + unsigned long remaining = count; >> + int rep_count; >> + int status; >> + unsigned long flags; >> + >> + local_irq_save(flags); >> + input_page = (struct hv_set_vp_registers *)(*this_cpu_ptr( >> + hyperv_pcpu_input_arg)); >> + >> + input_page->partition_id = partition_id; >> + input_page->vp_index = vp_index; >> + input_page->input_vtl = 0; >> + input_page->rsvd_z8 = 0; >> + input_page->rsvd_z16 = 0; >> + >> + while (remaining) { >> + rep_count = min(remaining, HV_SET_REGISTER_BATCH_SIZE); >> + memcpy(input_page->elements, registers, >> + sizeof(struct hv_register_assoc) * rep_count); >> + >> + hypercall_status = >> + hv_do_rep_hypercall(HVCALL_SET_VP_REGISTERS, rep_count, >> + 0, input_page, NULL); >> + status = hypercall_status & HV_HYPERCALL_RESULT_MASK; >> + if (status != HV_STATUS_SUCCESS) { >> + pr_err("%s: completed %li out of %u, %s\n", >> + __func__, >> + count - remaining, count, >> + hv_status_to_string(status)); >> + break; >> + } >> + completed = (hypercall_status & HV_HYPERCALL_REP_COMP_MASK) >> >> + HV_HYPERCALL_REP_COMP_OFFSET; >> + registers += completed; >> + remaining -= completed; >> + } >> + >> + local_irq_restore(flags); >> + >> + return -hv_status_to_errno(status); >> +} >> + >> +static long >> +mshv_vp_ioctl_get_regs(struct mshv_vp *vp, void __user *user_args) >> +{ >> + struct mshv_vp_registers args; >> + enum hv_register_name *names; >> + union hv_register_value *values; >> + long ret; >> + >> + if (copy_from_user(&args, user_args, sizeof(args))) >> + return -EFAULT; >> + >> + if (args.count > MSHV_VP_MAX_REGISTERS) >> + return -EINVAL; >> + >> + names = kmalloc_array(args.count, >> + sizeof(enum hv_register_name), >> + GFP_KERNEL); >> + if (!names) >> + return -ENOMEM; >> + >> + values = kmalloc_array(args.count, >> + sizeof(union hv_register_value), >> + GFP_KERNEL); >> + if (!values) { >> + kfree(names); >> + return -ENOMEM; >> + } >> + >> + if (copy_from_user(names, args.names, >> + sizeof(enum hv_register_name) * args.count)) { >> + ret = -EFAULT; >> + goto free_return; >> + } >> + >> + ret = hv_call_get_vp_registers(vp->index, vp->partition->id, >> + args.count, names, values); >> + if (ret) >> + goto free_return; >> + >> + if (copy_to_user(args.values, values, >> + sizeof(union hv_register_value) * args.count)) { >> + ret = -EFAULT; >> + } >> + >> +free_return: >> + kfree(names); >> + kfree(values); >> + return ret; >> +} >> + >> +static long >> +mshv_vp_ioctl_set_regs(struct mshv_vp *vp, void __user *user_args) >> +{ >> + int i; >> + struct mshv_vp_registers args; >> + struct hv_register_assoc *registers; >> + enum hv_register_name *names; >> + union hv_register_value *values; >> + long ret; >> + >> + if (copy_from_user(&args, user_args, sizeof(args))) >> + return -EFAULT; >> + >> + if (args.count > MSHV_VP_MAX_REGISTERS) >> + return -EINVAL; >> + >> + names = kmalloc_array(args.count, >> + sizeof(enum hv_register_name), >> + GFP_KERNEL); >> + if (!names) >> + return -ENOMEM; >> + >> + values = kmalloc_array(args.count, >> + sizeof(union hv_register_value), >> + GFP_KERNEL); >> + if (!values) { >> + kfree(names); >> + return -ENOMEM; >> + } >> + >> + registers = kmalloc_array(args.count, >> + sizeof(struct hv_register_assoc), >> + GFP_KERNEL); >> + if (!registers) { >> + kfree(values); >> + kfree(names); >> + return -ENOMEM; >> + } >> + >> + if (copy_from_user(names, args.names, >> + sizeof(enum hv_register_name) * args.count)) { >> + ret = -EFAULT; >> + goto free_return; >> + } >> + >> + if (copy_from_user(values, args.values, >> + sizeof(union hv_register_value) * args.count)) { >> + ret = -EFAULT; >> + goto free_return; >> + } >> + >> + for (i = 0; i < args.count; i++) { >> + memcpy(®isters[i].name, &names[i], >> + sizeof(enum hv_register_name)); >> + memcpy(®isters[i].value, &values[i], >> + sizeof(union hv_register_value)); >> + } > > The above will result in uninitialized memory being sent to > Hyper-V, since there is implicit padding associated with the > 32 bit name field. > This shouldn't be an issue after I change this to use hv_register_assoc, instead of separate names and values buffers. >> + >> + ret = hv_call_set_vp_registers(vp->index, vp->partition->id, >> + args.count, registers); >> + >> +free_return: >> + kfree(names); >> + kfree(values); >> + kfree(registers); >> + return ret; >> +} >> + >> + >> static long >> mshv_vp_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) >> { >> - return -ENOTTY; >> + struct mshv_vp *vp = filp->private_data; >> + long r = 0; >> + >> + if (mutex_lock_killable(&vp->mutex)) >> + return -EINTR; >> + >> + switch (ioctl) { >> + case MSHV_GET_VP_REGISTERS: >> + r = mshv_vp_ioctl_get_regs(vp, (void __user *)arg); >> + break; >> + case MSHV_SET_VP_REGISTERS: >> + r = mshv_vp_ioctl_set_regs(vp, (void __user *)arg); >> + break; >> + default: >> + r = -ENOTTY; >> + break; >> + } >> + mutex_unlock(&vp->mutex); >> + >> + return r; >> } >> >> static int >> @@ -420,6 +674,8 @@ mshv_partition_ioctl_create_vp(struct mshv_partition *partition, >> if (!vp) >> return -ENOMEM; >> >> + mutex_init(&vp->mutex); >> + >> vp->index = args.vp_index; >> vp->partition = mshv_partition_get(partition); >> if (!vp->partition) { >> -- >> 2.25.1