On 2/8/2021 11:48 AM, Michael Kelley wrote: > From: Nuno Das Neves <nunodasneves@xxxxxxxxxxxxxxxxxxx> Sent: Friday, November 20, 2020 4:31 PM >> To: linux-hyperv@xxxxxxxxxxxxxxx >> Cc: virtualization@xxxxxxxxxxxxxxxxxxxxxxxxxx; linux-kernel@xxxxxxxxxxxxxxx; Michael Kelley >> <mikelley@xxxxxxxxxxxxx>; viremana@xxxxxxxxxxxxxxxxxxx; Sunil Muthuswamy >> <sunilmut@xxxxxxxxxxxxx>; nunodasneves@xxxxxxxxxxxxxxxxxxx; wei.liu@xxxxxxxxxx; >> Lillian Grassin-Drake <Lillian.GrassinDrake@xxxxxxxxxxxxx>; KY Srinivasan >> <kys@xxxxxxxxxxxxx> >> Subject: [RFC PATCH 15/18] virt/mshv: get and set vp state ioctls >> >> Introduce ioctls for getting and setting guest vcpu emulated LAPIC >> state, and xsave data. >> >> Signed-off-by: Nuno Das Neves <nunodasneves@xxxxxxxxxxxxxxxxxxx> >> --- >> Documentation/virt/mshv/api.rst | 8 + >> arch/x86/include/uapi/asm/hyperv-tlfs.h | 59 ++++++ >> include/asm-generic/hyperv-tlfs.h | 41 ++++ >> include/uapi/asm-generic/hyperv-tlfs.h | 28 +++ >> include/uapi/linux/mshv.h | 13 ++ >> virt/mshv/mshv_main.c | 262 ++++++++++++++++++++++++ >> 6 files changed, 411 insertions(+) >> >> diff --git a/Documentation/virt/mshv/api.rst b/Documentation/virt/mshv/api.rst >> index 694f978131f9..7fd75f248eff 100644 >> --- a/Documentation/virt/mshv/api.rst >> +++ b/Documentation/virt/mshv/api.rst >> @@ -140,4 +140,12 @@ Assert interrupts in partitions that use Microsoft Hypervisor's >> internal >> emulated LAPIC. This must be enabled on partition creation with the flag: >> HV_PARTITION_CREATION_FLAG_LAPIC_ENABLED >> >> +3.9 MSHV_GET_VP_STATE and MSHV_SET_VP_STATE >> +-------------------------- >> +:Type: vp ioctl >> +:Parameters: struct mshv_vp_state >> +:Returns: 0 on success >> + >> +Get/set various vp state. Currently these can be used to get and set >> +emulated LAPIC state, and xsave data. >> >> diff --git a/arch/x86/include/uapi/asm/hyperv-tlfs.h b/arch/x86/include/uapi/asm/hyperv- >> tlfs.h >> index 5478d4943bfc..78758aedf23e 100644 >> --- a/arch/x86/include/uapi/asm/hyperv-tlfs.h >> +++ b/arch/x86/include/uapi/asm/hyperv-tlfs.h >> @@ -1051,4 +1051,63 @@ union hv_interrupt_control { >> __u64 as_uint64; >> }; >> >> +struct hv_local_interrupt_controller_state { >> + __u32 apic_id; >> + __u32 apic_version; >> + __u32 apic_ldr; >> + __u32 apic_dfr; >> + __u32 apic_spurious; >> + __u32 apic_isr[8]; >> + __u32 apic_tmr[8]; >> + __u32 apic_irr[8]; >> + __u32 apic_esr; >> + __u32 apic_icr_high; >> + __u32 apic_icr_low; >> + __u32 apic_lvt_timer; >> + __u32 apic_lvt_thermal; >> + __u32 apic_lvt_perfmon; >> + __u32 apic_lvt_lint0; >> + __u32 apic_lvt_lint1; >> + __u32 apic_lvt_error; >> + __u32 apic_lvt_cmci; >> + __u32 apic_error_status; >> + __u32 apic_initial_count; >> + __u32 apic_counter_value; >> + __u32 apic_divide_configuration; >> + __u32 apic_remote_read; >> +}; >> + >> +#define HV_XSAVE_DATA_NO_XMM_REGISTERS 1 >> + >> +union hv_x64_xsave_xfem_register { >> + __u64 as_uint64; >> + struct { >> + __u32 low_uint32; >> + __u32 high_uint32; >> + }; >> + struct { >> + __u64 legacy_x87: 1; >> + __u64 legacy_sse: 1; >> + __u64 avx: 1; >> + __u64 mpx_bndreg: 1; >> + __u64 mpx_bndcsr: 1; >> + __u64 avx_512_op_mask: 1; >> + __u64 avx_512_zmmhi: 1; >> + __u64 avx_512_zmm16_31: 1; >> + __u64 rsvd8_9: 2; >> + __u64 pasid: 1; >> + __u64 cet_u: 1; >> + __u64 cet_s: 1; >> + __u64 rsvd13_16: 4; >> + __u64 xtile_cfg: 1; >> + __u64 xtile_data: 1; >> + __u64 rsvd19_63: 45; >> + }; >> +}; >> + >> +struct hv_vp_state_data_xsave { >> + __u64 flags; >> + union hv_x64_xsave_xfem_register states; >> +}; >> + >> #endif >> diff --git a/include/asm-generic/hyperv-tlfs.h b/include/asm-generic/hyperv-tlfs.h >> index 2cd46241c545..4bc59a0344ce 100644 >> --- a/include/asm-generic/hyperv-tlfs.h >> +++ b/include/asm-generic/hyperv-tlfs.h >> @@ -167,6 +167,9 @@ struct ms_hyperv_tsc_page { >> #define HVCALL_ASSERT_VIRTUAL_INTERRUPT 0x0094 >> #define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE 0x00af >> #define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_LIST 0x00b0 >> +#define HVCALL_MAP_VP_STATE_PAGE 0x00e1 >> +#define HVCALL_GET_VP_STATE 0x00e3 >> +#define HVCALL_SET_VP_STATE 0x00e4 >> >> #define HV_FLUSH_ALL_PROCESSORS BIT(0) >> #define HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES BIT(1) >> @@ -796,4 +799,42 @@ struct hv_assert_virtual_interrupt { >> u16 rsvd_z1; >> }; >> >> +struct hv_vp_state_data { >> + enum hv_get_set_vp_state_type type; >> + u32 rsvd; >> + struct hv_vp_state_data_xsave xsave; >> + >> +}; >> + >> +struct hv_get_vp_state_in { >> + u64 partition_id; >> + u32 vp_index; >> + u8 input_vtl; >> + u8 rsvd0; >> + u16 rsvd1; >> + struct hv_vp_state_data state_data; >> + u64 output_data_pfns[]; >> +}; >> + >> +union hv_get_vp_state_out { >> + struct hv_local_interrupt_controller_state interrupt_controller_state; >> + /* Not supported yet */ >> + /* struct hv_synthetic_timers_state synthetic_timers_state; */ >> +}; >> + >> +union hv_input_set_vp_state_data { >> + u64 pfns; >> + u8 bytes; >> +}; >> + >> +struct hv_set_vp_state_in { >> + u64 partition_id; >> + u32 vp_index; >> + u8 input_vtl; >> + u8 rsvd0; >> + u16 rsvd1; >> + struct hv_vp_state_data state_data; >> + union hv_input_set_vp_state_data data[]; >> +}; >> + >> #endif >> diff --git a/include/uapi/asm-generic/hyperv-tlfs.h b/include/uapi/asm-generic/hyperv- >> tlfs.h >> index e87389054b68..b3c84c69b73f 100644 >> --- a/include/uapi/asm-generic/hyperv-tlfs.h >> +++ b/include/uapi/asm-generic/hyperv-tlfs.h >> @@ -64,4 +64,32 @@ struct hv_message { >> #define HV_MAP_GPA_EXECUTABLE 0xC >> #define HV_MAP_GPA_PERMISSIONS_MASK 0xF >> >> +/* >> + * For getting and setting VP state, there are two options based on the state type: >> + * >> + * 1.) Data that is accessed by PFNs in the input hypercall page. This is used >> + * for state which may not fit into the hypercall pages. >> + * 2.) Data that is accessed directly in the input\output hypercall pages. >> + * This is used for state that will always fit into the hypercall pages. >> + * >> + * In the future this could be dynamic based on the size if needed. >> + * >> + * Note these hypercalls have an 8-byte aligned variable header size as per the tlfs >> + */ >> + >> +#define HV_GET_SET_VP_STATE_TYPE_PFN BIT(31) >> + >> +enum hv_get_set_vp_state_type { >> + HV_GET_SET_VP_STATE_LOCAL_INTERRUPT_CONTROLLER_STATE = 0, >> + >> + HV_GET_SET_VP_STATE_XSAVE = 1 | >> HV_GET_SET_VP_STATE_TYPE_PFN, >> + /* Synthetic message page */ >> + HV_GET_SET_VP_STATE_SIM_PAGE = 2 | >> HV_GET_SET_VP_STATE_TYPE_PFN, >> + /* Synthetic interrupt event flags page. */ >> + HV_GET_SET_VP_STATE_SIEF_PAGE = 3 | >> HV_GET_SET_VP_STATE_TYPE_PFN, >> + >> + /* Synthetic timers. */ >> + HV_GET_SET_VP_STATE_SYNTHETIC_TIMERS = 4, >> +}; >> + >> #endif >> diff --git a/include/uapi/linux/mshv.h b/include/uapi/linux/mshv.h >> index faed9d065bb7..ae0bb64bbec3 100644 >> --- a/include/uapi/linux/mshv.h >> +++ b/include/uapi/linux/mshv.h >> @@ -53,6 +53,17 @@ struct mshv_assert_interrupt { >> __u32 vector; >> }; >> >> +struct mshv_vp_state { >> + enum hv_get_set_vp_state_type type; >> + struct hv_vp_state_data_xsave xsave; /* only for xsave request */ >> + >> + __u64 buf_size; /* If xsave, must be page-aligned */ >> + union { >> + struct hv_local_interrupt_controller_state *lapic; >> + __u8 *bytes; /* Xsave data. must be page-aligned */ >> + } buf; >> +}; >> + >> #define MSHV_IOCTL 0xB8 >> >> /* mshv device */ >> @@ -70,5 +81,7 @@ struct mshv_assert_interrupt { >> #define MSHV_GET_VP_REGISTERS _IOWR(MSHV_IOCTL, 0x05, struct >> mshv_vp_registers) >> #define MSHV_SET_VP_REGISTERS _IOW(MSHV_IOCTL, 0x06, struct mshv_vp_registers) >> #define MSHV_RUN_VP _IOR(MSHV_IOCTL, 0x07, struct hv_message) >> +#define MSHV_GET_VP_STATE _IOWR(MSHV_IOCTL, 0x0A, struct mshv_vp_state) >> +#define MSHV_SET_VP_STATE _IOWR(MSHV_IOCTL, 0x0B, struct mshv_vp_state) >> >> #endif >> diff --git a/virt/mshv/mshv_main.c b/virt/mshv/mshv_main.c >> index 9cf236ade50a..70172d9488de 100644 >> --- a/virt/mshv/mshv_main.c >> +++ b/virt/mshv/mshv_main.c >> @@ -864,6 +864,262 @@ mshv_vp_ioctl_set_regs(struct mshv_vp *vp, void __user >> *user_args) >> return ret; >> } >> >> +static int >> +hv_call_get_vp_state(u32 vp_index, >> + u64 partition_id, >> + enum hv_get_set_vp_state_type type, >> + struct hv_vp_state_data_xsave xsave, >> + /* Choose between pages and ret_output */ >> + u64 page_count, >> + struct page **pages, >> + union hv_get_vp_state_out *ret_output) >> +{ >> + struct hv_get_vp_state_in *input; >> + union hv_get_vp_state_out *output; >> + int status; >> + int i; >> + u64 control; >> + unsigned long flags; >> + int ret = 0; >> + >> + if (sizeof(*input) + (page_count * sizeof(u64)) > PAGE_SIZE) >> + return -EINVAL; > > Nit: Stylistically, you are handling this differently from the BATCH_SIZE > macros, which are essentially doing the same thing of calculating > how many entries will fit in the input page. Note to use > HV_HYP_PAGE_SIZE. > Hmm, I didn't notice this. I guess it's ok either way, but for consistency I will add: #define HV_GET_VP_STATE_BATCH_SIZE ((HV_HYP_PAGE_SIZE - sizeof(struct hv_get_vp_state_in)) / sizeof(u64)) And change the condition to: if (page_count > HV_GET_VP_STATE_BATCH_SIZE) >> + >> + if (!page_count && !ret_output) >> + return -EINVAL; >> + >> + do { >> + local_irq_save(flags); >> + input = (struct hv_get_vp_state_in *) >> + (*this_cpu_ptr(hyperv_pcpu_input_arg)); >> + output = (union hv_get_vp_state_out *) >> + (*this_cpu_ptr(hyperv_pcpu_output_arg)); >> + memset(input, 0, sizeof(*input)); >> + memset(output, 0, sizeof(*output)); >> + >> + input->partition_id = partition_id; >> + input->vp_index = vp_index; >> + input->state_data.type = type; >> + memcpy(&input->state_data.xsave, &xsave, sizeof(xsave)); >> + for (i = 0; i < page_count; i++) >> + input->output_data_pfns[i] = >> + page_to_pfn(pages[i]) & HV_MAP_GPA_MASK; >> + >> + control = (HVCALL_GET_VP_STATE) | >> + (page_count << HV_HYPERCALL_VARHEAD_OFFSET); >> + >> + status = hv_do_hypercall(control, input, output) & >> + HV_HYPERCALL_RESULT_MASK; >> + >> + if (status != HV_STATUS_INSUFFICIENT_MEMORY) { >> + if (status != HV_STATUS_SUCCESS) >> + pr_err("%s: %s\n", __func__, >> + hv_status_to_string(status)); >> + else if (ret_output) >> + memcpy(ret_output, output, sizeof(*output)); >> + >> + local_irq_restore(flags); >> + ret = -hv_status_to_errno(status); >> + break; >> + } >> + local_irq_restore(flags); >> + >> + ret = hv_call_deposit_pages(NUMA_NO_NODE, >> + partition_id, 1); >> + } while (!ret); >> + >> + return ret; >> +} >> + >> +static int >> +hv_call_set_vp_state(u32 vp_index, >> + u64 partition_id, >> + enum hv_get_set_vp_state_type type, >> + struct hv_vp_state_data_xsave xsave, >> + /* Choose between pages and bytes */ >> + u64 page_count, >> + struct page **pages, >> + u32 num_bytes, >> + u8 *bytes) >> +{ >> + struct hv_set_vp_state_in *input; >> + int status; >> + int i; >> + u64 control; >> + unsigned long flags; >> + int ret = 0; >> + u16 varhead_sz; >> + >> + if (sizeof(*input) + (page_count * sizeof(u64)) > PAGE_SIZE) > > Same comment as above. > I'll do the same as above. >> + return -EINVAL; >> + if (sizeof(*input) + num_bytes > PAGE_SIZE) > > Use HV_HYP_PAGE_SIZE. > Will do. >> + return -EINVAL; >> + >> + if (num_bytes) >> + /* round up to 8 and divide by 8 */ >> + varhead_sz = (num_bytes + 7) >> 3; >> + else if (page_count) >> + varhead_sz = page_count; >> + else >> + return -EINVAL; >> + >> + do { >> + local_irq_save(flags); >> + input = (struct hv_set_vp_state_in *) >> + (*this_cpu_ptr(hyperv_pcpu_input_arg)); >> + memset(input, 0, sizeof(*input)); >> + >> + input->partition_id = partition_id; >> + input->vp_index = vp_index; >> + input->state_data.type = type; >> + memcpy(&input->state_data.xsave, &xsave, sizeof(xsave)); >> + if (num_bytes) { >> + memcpy((u8 *)input->data, bytes, num_bytes); >> + } else { >> + for (i = 0; i < page_count; i++) >> + input->data[i].pfns = >> + page_to_pfn(pages[i]) & HV_MAP_GPA_MASK; > > Same comment as in earlier patch about GPA_MASK. Also, this doesn't work > if PAGE_SIZE != HV_HYP_PAGE_SIZE, though it may be fine to not handle that case > for now. > Will remove the mask. As before, won't handle PAGE_SIZE != HV_HYP_PAGE_SIZE in this patch set. >> + } >> + >> + control = (HVCALL_SET_VP_STATE) | >> + (varhead_sz << HV_HYPERCALL_VARHEAD_OFFSET); >> + >> + status = hv_do_hypercall(control, input, NULL) & >> + HV_HYPERCALL_RESULT_MASK; >> + >> + if (status != HV_STATUS_INSUFFICIENT_MEMORY) { >> + if (status != HV_STATUS_SUCCESS) >> + pr_err("%s: %s\n", __func__, >> + hv_status_to_string(status)); >> + >> + local_irq_restore(flags); >> + ret = -hv_status_to_errno(status); >> + break; >> + } >> + local_irq_restore(flags); >> + >> + ret = hv_call_deposit_pages(NUMA_NO_NODE, >> + partition_id, 1); >> + } while (!ret); >> + >> + return ret; >> +} >> + >> +static long >> +mshv_vp_ioctl_get_set_state_pfn(struct mshv_vp *vp, >> + struct mshv_vp_state *args, >> + bool is_set) >> +{ >> + u64 page_count, remaining; >> + int completed; >> + struct page **pages; >> + long ret; >> + unsigned long u_buf; >> + >> + /* Buffer must be page aligned */ >> + if (args->buf_size & (PAGE_SIZE - 1) || >> + (u64)args->buf.bytes & (PAGE_SIZE - 1)) >> + return -EINVAL; > > Use PAGE_ALIGNED macro. > Will do. >> + >> + if (!access_ok(args->buf.bytes, args->buf_size)) >> + return -EFAULT; >> + >> + /* Pin user pages so hypervisor can copy directly to them */ >> + page_count = args->buf_size >> PAGE_SHIFT; >> + pages = kcalloc(page_count, sizeof(struct page *), GFP_KERNEL); >> + if (!pages) >> + return -ENOMEM; >> + >> + remaining = page_count; >> + u_buf = (unsigned long)args->buf.bytes; >> + while (remaining) { >> + completed = pin_user_pages_fast( >> + u_buf, >> + remaining, >> + FOLL_WRITE, >> + &pages[page_count - remaining]); >> + if (completed < 0) { >> + pr_err("%s: failed to pin user pages error %i\n", >> + __func__, completed); >> + ret = completed; >> + goto unpin_pages; >> + } >> + remaining -= completed; >> + u_buf += completed * PAGE_SIZE; >> + } >> + >> + if (is_set) >> + ret = hv_call_set_vp_state(vp->index, >> + vp->partition->id, >> + args->type, args->xsave, >> + page_count, pages, >> + 0, NULL); >> + else >> + ret = hv_call_get_vp_state(vp->index, >> + vp->partition->id, >> + args->type, args->xsave, >> + page_count, pages, >> + NULL); >> + >> +unpin_pages: >> + unpin_user_pages(pages, page_count - remaining); >> + kfree(pages); >> + return ret; >> +} >> + >> +static long >> +mshv_vp_ioctl_get_set_state(struct mshv_vp *vp, void __user *user_args, bool is_set) >> +{ >> + struct mshv_vp_state args; >> + long ret = 0; >> + union hv_get_vp_state_out vp_state; >> + >> + if (copy_from_user(&args, user_args, sizeof(args))) >> + return -EFAULT; >> + >> + /* For now just support these */ >> + if (args.type != HV_GET_SET_VP_STATE_LOCAL_INTERRUPT_CONTROLLER_STATE && >> + args.type != HV_GET_SET_VP_STATE_XSAVE) >> + return -EINVAL; >> + >> + /* If we need to pin pfns, delegate to helper */ >> + if (args.type & HV_GET_SET_VP_STATE_TYPE_PFN) >> + return mshv_vp_ioctl_get_set_state_pfn(vp, &args, is_set); >> + >> + if (args.buf_size < sizeof(vp_state)) >> + return -EINVAL; >> + >> + if (is_set) { >> + if (copy_from_user( >> + &vp_state, >> + args.buf.lapic, >> + sizeof(vp_state))) >> + return -EFAULT; >> + >> + return hv_call_set_vp_state(vp->index, >> + vp->partition->id, >> + args.type, args.xsave, >> + 0, NULL, >> + sizeof(vp_state), >> + (u8 *)&vp_state); >> + } >> + >> + ret = hv_call_get_vp_state(vp->index, >> + vp->partition->id, >> + args.type, args.xsave, >> + 0, NULL, >> + &vp_state); >> + >> + if (ret) >> + return ret; >> + >> + if (copy_to_user(args.buf.lapic, >> + &vp_state.interrupt_controller_state, >> + sizeof(vp_state.interrupt_controller_state))) >> + return -EFAULT; >> + >> + return 0; >> +} >> >> static long >> mshv_vp_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) >> @@ -884,6 +1140,12 @@ mshv_vp_ioctl(struct file *filp, unsigned int ioctl, unsigned long >> arg) >> case MSHV_SET_VP_REGISTERS: >> r = mshv_vp_ioctl_set_regs(vp, (void __user *)arg); >> break; >> + case MSHV_GET_VP_STATE: >> + r = mshv_vp_ioctl_get_set_state(vp, (void __user *)arg, false); >> + break; >> + case MSHV_SET_VP_STATE: >> + r = mshv_vp_ioctl_get_set_state(vp, (void __user *)arg, true); >> + break; >> default: >> r = -ENOTTY; >> break; >> -- >> 2.25.1