On Mon, Oct 29, 2018 at 06:54:50PM +0000, Roman Kagan <rkagan@xxxxxxxxxxxxx> wrote:
> On Wed, Oct 24, 2018 at 09:48:26PM -0700, Isaku Yamahata wrote:
> > hyper-v hypercall supports xmm fast hypercall
> > where argument is exchanged though regular/xmm registers.
> > This patch implements them and make use of them.
> > With this patch, hyperv/hv_apic.c and hyperv/mmu.c will use (xmm) fast
> > hypercall.
> >
> > Signed-off-by: Isaku Yamahata <isaku.yamahata@xxxxxxxxx>
> > ---
> >  arch/x86/hyperv/mmu.c               |   4 +-
> >  arch/x86/hyperv/nested.c            |   2 +-
> >  arch/x86/include/asm/hyperv-tlfs.h  |   3 +
> >  arch/x86/include/asm/mshyperv.h     | 176 ++++++++++++++++++++++++++++++++++--
> >  drivers/hv/hv.c                     |   3 +-
> >  drivers/pci/controller/pci-hyperv.c |   7 +-
> >  6 files changed, 179 insertions(+), 16 deletions(-)
> >
> > diff --git a/arch/x86/hyperv/mmu.c b/arch/x86/hyperv/mmu.c
> > index ef5f29f913d7..41820372bb3d 100644
> > --- a/arch/x86/hyperv/mmu.c
> > +++ b/arch/x86/hyperv/mmu.c
> > @@ -134,11 +134,11 @@ static void hyperv_flush_tlb_others(const struct cpumask *cpus,
> >  	if (info->end == TLB_FLUSH_ALL) {
> >  		flush->flags |= HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY;
> >  		status = hv_do_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE,
> > -					 flush, NULL);
> > +					 flush, sizeof(*flush), NULL, 0);
> >  	} else if (info->end &&
> >  		   ((info->end - info->start)/HV_TLB_FLUSH_UNIT) > max_gvas) {
> >  		status = hv_do_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE,
> > -					 flush, NULL);
> > +					 flush, sizeof(*flush), NULL, 0);
> >  	} else {
> >  		gva_n = fill_gva_list(flush->gva_list, 0,
> >  				      info->start, info->end);
> > diff --git a/arch/x86/hyperv/nested.c b/arch/x86/hyperv/nested.c
> > index b8e60cc50461..5fd24f4f2ae3 100644
> > --- a/arch/x86/hyperv/nested.c
> > +++ b/arch/x86/hyperv/nested.c
> > @@ -43,7 +43,7 @@ int hyperv_flush_guest_mapping(u64 as)
> >  	flush->flags = 0;
> >
> >  	status = hv_do_hypercall(HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE,
> > -				 flush, NULL);
> > +				 flush, sizeof(*flush), NULL, 0);
> >  	local_irq_restore(flags);
> >
> >  	if (!(status & HV_HYPERCALL_RESULT_MASK))
> > diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h
> > index 00e01d215f74..d80e0151b790 100644
> > --- a/arch/x86/include/asm/hyperv-tlfs.h
> > +++ b/arch/x86/include/asm/hyperv-tlfs.h
> > @@ -123,6 +123,7 @@
> >   * registers is available
> >   */
> >  #define HV_X64_HYPERCALL_PARAMS_XMM_AVAILABLE	(1 << 4)
> > +#define HV_X64_HYPERCALL_OUTPUT_XMM_AVAILABLE	(1 << 15)
> >  /* Support for a virtual guest idle state is available */
> >  #define HV_X64_GUEST_IDLE_STATE_AVAILABLE	(1 << 5)
> >  /* Guest crash data handler available */
> > @@ -383,10 +384,12 @@ enum HV_GENERIC_SET_FORMAT {
> >  #define HV_HYPERCALL_RESULT_MASK	GENMASK_ULL(15, 0)
> >  #define HV_HYPERCALL_FAST_BIT		BIT(16)
> >  #define HV_HYPERCALL_VARHEAD_OFFSET	17
> > +#define HV_HYPERCALL_VARHEAD_MASK	GENMASK_ULL(26, 17)
> >  #define HV_HYPERCALL_REP_COMP_OFFSET	32
> >  #define HV_HYPERCALL_REP_COMP_MASK	GENMASK_ULL(43, 32)
> >  #define HV_HYPERCALL_REP_START_OFFSET	48
> >  #define HV_HYPERCALL_REP_START_MASK	GENMASK_ULL(59, 48)
> > +#define HV_XMM_BYTE_MAX			112
> >
> >  /* hypercall status code */
> >  #define HV_STATUS_SUCCESS		0
> > diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
> > index f37704497d8f..5d8acb00ab94 100644
> > --- a/arch/x86/include/asm/mshyperv.h
> > +++ b/arch/x86/include/asm/mshyperv.h
> > @@ -132,16 +132,13 @@ extern struct clocksource *hyperv_cs;
> >  extern void *hv_hypercall_pg;
> >  extern void __percpu **hyperv_pcpu_input_arg;
> >
> > -static inline u64 hv_do_hypercall(u64 control, void *input, void *output)
> > +static inline u64 __hv_do_hypercall(u64 control, void *input, void *output)
>
> As some of the hypercalls either don't support or don't need to use fast
> xmm scheme I'd rather keep this function name as is and introduce a new
> one for fast xmm.

Okay.  Anyway, I don't have a strong opinion here.
So far, HVCALL_POST_MESSAGE is the only hypercall I've found that doesn't
support the fast scheme.  It's not clear from the TLFS spec.

> >  {
> >  	u64 input_address = input ? virt_to_phys(input) : 0;
> >  	u64 output_address = output ? virt_to_phys(output) : 0;
> >  	u64 hv_status;
> >
> >  #ifdef CONFIG_X86_64
> > -	if (!hv_hypercall_pg)
> > -		return U64_MAX;
> > -
>
> Later in this patch you still use this function directly, so you can't
> drop this test.

Will fix it.

> >  	__asm__ __volatile__("mov %4, %%r8\n"
> >  			     CALL_NOSPEC
> >  			     : "=a" (hv_status), ASM_CALL_CONSTRAINT,
> > @@ -155,9 +152,6 @@ static inline u64 hv_do_hypercall(u64 control, void *input, void *output)
> >  	u32 output_address_hi = upper_32_bits(output_address);
> >  	u32 output_address_lo = lower_32_bits(output_address);
> >
> > -	if (!hv_hypercall_pg)
> > -		return U64_MAX;
> > -
> >  	__asm__ __volatile__(CALL_NOSPEC
> >  			     : "=A" (hv_status),
> >  			       "+c" (input_address_lo), ASM_CALL_CONSTRAINT
> > @@ -201,7 +195,7 @@ static inline u64 hv_do_fast_hypercall8(u16 code, u64 input1)
> >  	return hv_status;
> >  }
> >
> > -/* Fast hypercall with 16 bytes of input */
> > +/* Fast hypercall with 16 bytes of input and no output */
> >  static inline u64 hv_do_fast_hypercall16(u16 code, u64 input1, u64 input2)
> >  {
> >  	u64 hv_status, control = (u64)code | HV_HYPERCALL_FAST_BIT;
> > @@ -246,11 +240,14 @@ static inline u64 hv_do_rep_hypercall(u16 code, u16 rep_count, u16 varhead_size,
> >  	u64 status;
> >  	u16 rep_comp;
> >
> > +	if (unlikely(!hv_hypercall_pg))
> > +		return U64_MAX;
> > +
> >  	control |= (u64)varhead_size << HV_HYPERCALL_VARHEAD_OFFSET;
> >  	control |= (u64)rep_count << HV_HYPERCALL_REP_COMP_OFFSET;
> >
> >  	do {
> > -		status = hv_do_hypercall(control, input, output);
> > +		status = __hv_do_hypercall(control, input, output);
> >  		if ((status & HV_HYPERCALL_RESULT_MASK) != HV_STATUS_SUCCESS)
> >  			return status;
> >
> > @@ -267,6 +264,167 @@ static inline u64 hv_do_rep_hypercall(u16 code, u16 rep_count, u16 varhead_size,
> >  	return status;
> >  }
> >
> > +/* ibytes = fixed header size + var header size + data size in bytes */
> > +static inline u64 hv_do_xmm_fast_hypercall(
> > +	u32 varhead_code, void *input, size_t ibytes,
> > +	void *output, size_t obytes)
> > +{
> > +	u64 control = (u64)varhead_code | HV_HYPERCALL_FAST_BIT;
> > +	u64 hv_status;
> > +	u64 input1;
> > +	u64 input2;
> > +	size_t i_end = roundup(ibytes, 16);
> > +	size_t o_end = i_end + roundup(obytes, 16);
> > +	u64 *ixmm = (u64 *)input + 2;
> > +	u64 tmp[(o_end - 16) / 8] __aligned((16));
> > +
> > +	BUG_ON(i_end <= 16);
> > +	BUG_ON(o_end > HV_XMM_BYTE_MAX);
> > +	BUG_ON(!IS_ALIGNED((unsigned long)input, 16));
> > +	BUG_ON(!IS_ALIGNED((unsigned long)output, 16));
> > +
> > +	/* it's assumed that there are at least two inputs */
> > +	input1 = ((u64 *)input)[0];
> > +	input2 = ((u64 *)input)[1];
> > +
> > +	preempt_disable();
>
> Don't you rather need kernel_fpu_begin() here (paired with
> kernel_fpu_end() at the end)?  This may affect your benchmark results
> noticably.

You're right, it would affect the results.  It's intentional NOT to use
kernel_fpu_begin()/end() for exactly that reason.  I'll add a comment on it.
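(For illustration only, a rough sketch of what the kernel_fpu_begin()/
kernel_fpu_end() variant would look like; this is not part of the patch,
and hv_xmm_load_sketch() is a made-up name.  The point is that the generic
in-kernel FPU API saves and restores the whole FPU state rather than only
the few XMM registers the hypercall actually clobbers, which is where the
extra cost would come from.)

#include <linux/types.h>
#include <asm/fpu/api.h>	/* kernel_fpu_begin()/kernel_fpu_end() */

/*
 * Sketch only: wrap the XMM usage in the generic in-kernel FPU API
 * instead of hand-spilling the clobbered registers with movdqa.
 * kernel_fpu_begin() disables preemption and saves the full FPU state;
 * kernel_fpu_end() restores it.
 */
static inline void hv_xmm_load_sketch(const u64 *ixmm)
{
	kernel_fpu_begin();
	__asm__ __volatile__("movdqa %0, %%xmm0" : : "m" (ixmm[0]));
	/* ... load the remaining XMM arguments and issue the hypercall ... */
	kernel_fpu_end();
}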
> > +	if (o_end > 2 * 8)
> > +		__asm__ __volatile__("movdqa %%xmm0, %0" : : "m" (tmp[0]));
> > +	if (o_end > 4 * 8)
> > +		__asm__ __volatile__("movdqa %%xmm1, %0" : : "m" (tmp[2]));
> > +	if (o_end > 6 * 8)
> > +		__asm__ __volatile__("movdqa %%xmm2, %0" : : "m" (tmp[4]));
> > +	if (o_end > 8 * 8)
> > +		__asm__ __volatile__("movdqa %%xmm3, %0" : : "m" (tmp[6]));
> > +	if (o_end > 10 * 8)
> > +		__asm__ __volatile__("movdqa %%xmm4, %0" : : "m" (tmp[8]));
> > +	if (o_end > 12 * 8)
> > +		__asm__ __volatile__("movdqa %%xmm5, %0" : : "m" (tmp[10]));
> > +	if (ibytes > 2 * 8)
> > +		__asm__ __volatile__("movdqa %0, %%xmm0" : : "m" (ixmm[0]));
> > +	if (ibytes > 4 * 8)
> > +		__asm__ __volatile__("movdqa %0, %%xmm1" : : "m" (ixmm[2]));
> > +	if (ibytes > 6 * 8)
> > +		__asm__ __volatile__("movdqa %0, %%xmm2" : : "m" (ixmm[4]));
> > +	if (ibytes > 8 * 8)
> > +		__asm__ __volatile__("movdqa %0, %%xmm3" : : "m" (ixmm[6]));
> > +	if (ibytes > 10 * 8)
> > +		__asm__ __volatile__("movdqa %0, %%xmm4" : : "m" (ixmm[8]));
> > +	if (ibytes > 12 * 8)
> > +		__asm__ __volatile__("movdqa %0, %%xmm5" : : "m" (ixmm[10]));
> > +
> > +#ifdef CONFIG_X86_64
> > +	__asm__ __volatile__("mov %4, %%r8\n"
> > +			     CALL_NOSPEC
> > +			     : "=a" (hv_status), ASM_CALL_CONSTRAINT,
> > +			       "+c" (control), "+d" (input1)
> > +			     : "r" (input2),
> > +			       THUNK_TARGET(hv_hypercall_pg)
> > +			     : "cc", "memory", "r8", "r9", "r10", "r11");
> > +#else
> > +	{
> > +		u32 input1_hi = upper_32_bits(input1);
> > +		u32 input1_lo = lower_32_bits(input1);
> > +		u32 input2_hi = upper_32_bits(input2);
> > +		u32 input2_lo = lower_32_bits(input2);
> > +
> > +		__asm__ __volatile__ (CALL_NOSPEC
> > +				      : "=A"(hv_status),
> > +					"+c"(input1_lo), ASM_CALL_CONSTRAINT
> > +				      : "A" (control), "b" (input1_hi),
> > +					"D"(input2_hi), "S"(input2_lo),
> > +					THUNK_TARGET(hv_hypercall_pg)
> > +				      : "cc", "memory");
> > +	}
> > +#endif
> > +	if (output) {
> > +		u64 *oxmm = (u64 *)output;
> > +		if (i_end <= 2 * 8 && 2 * 8 < o_end) {
> > +			__asm__ __volatile__(
> > +				"movdqa %%xmm0, %0" : "=m" (oxmm[0]));
> > +			oxmm += 2;
> > +		}
> > +		if (i_end <= 4 * 8 && 4 * 8 < o_end) {
> > +			__asm__ __volatile__(
> > +				"movdqa %%xmm1, %0" : "=m" (oxmm[0]));
> > +			oxmm += 2;
> > +		}
> > +		if (i_end <= 6 * 8 && 6 * 8 < o_end) {
> > +			__asm__ __volatile__(
> > +				"movdqa %%xmm2, %0" : "=m" (oxmm[0]));
> > +			oxmm += 2;
> > +		}
> > +		if (i_end <= 8 * 8 && 8 * 8 < o_end) {
> > +			__asm__ __volatile__(
> > +				"movdqa %%xmm3, %0" : "=m" (oxmm[0]));
> > +			oxmm += 2;
> > +		}
> > +		if (i_end <= 10 * 8 && 10 * 8 < o_end) {
> > +			__asm__ __volatile__(
> > +				"movdqa %%xmm4, %0" : "=m" (oxmm[0]));
> > +			oxmm += 2;
> > +		}
> > +		if (i_end <= 12 * 8 && 12 * 8 < o_end) {
> > +			__asm__ __volatile__(
> > +				"movdqa %%xmm5, %0" : "=m" (oxmm[0]));
> > +			oxmm += 2;
> > +		}
> > +	}
> > +	if (o_end > 2 * 8)
> > +		__asm__ __volatile__("movdqa %0, %%xmm0" : : "m" (tmp[0]));
> > +	if (o_end > 4 * 8)
> > +		__asm__ __volatile__("movdqa %0, %%xmm1" : : "m" (tmp[2]));
> > +	if (o_end > 6 * 8)
> > +		__asm__ __volatile__("movdqa %0, %%xmm2" : : "m" (tmp[4]));
> > +	if (o_end > 8 * 8)
> > +		__asm__ __volatile__("movdqa %0, %%xmm3" : : "m" (tmp[6]));
> > +	if (o_end > 10 * 8)
> > +		__asm__ __volatile__("movdqa %0, %%xmm4" : : "m" (tmp[8]));
> > +	if (o_end > 12 * 8)
> > +		__asm__ __volatile__("movdqa %0, %%xmm5" : : "m" (tmp[10]));
> > +	preempt_enable();
> > +
> > +	return hv_status;
> > +}
> > +
> > +static inline u64 hv_do_hypercall(
> > +	u32 varhead_code,
> > +	void *input, size_t ibytes, void *output, size_t obytes)
> > +{
> > +	if (unlikely(!hv_hypercall_pg))
> > +		return U64_MAX;
> > +
> > +	/* fast hypercall */
> > +	if (output == NULL && ibytes <= 16) {
> > +		u64 *i = (u64*)input;
> > +
> > +		WARN_ON((varhead_code & HV_HYPERCALL_VARHEAD_MASK) != 0);
> > +		if (ibytes <= 8)
> > +			return hv_do_fast_hypercall8((u16)varhead_code, i[0]);
> > +
> > +		return hv_do_fast_hypercall16((u16)varhead_code, i[0], i[1]);
> > +	}
> > +
> > +	/* xmm fast hypercall */
> > +	if (static_cpu_has(X86_FEATURE_XMM) &&
> > +	    ms_hyperv.features & HV_X64_HYPERCALL_PARAMS_XMM_AVAILABLE &&
> > +	    roundup(ibytes, 16) + obytes <= HV_XMM_BYTE_MAX) {
> > +		if (output) {
> > +			if (ms_hyperv.features &
> > +			    HV_X64_HYPERCALL_OUTPUT_XMM_AVAILABLE)
> > +				return hv_do_xmm_fast_hypercall(
> > +					varhead_code, input, ibytes,
> > +					output, obytes);
> > +		} else {
> > +			WARN_ON(obytes > 0);
> > +			return hv_do_xmm_fast_hypercall(
> > +				varhead_code, input, ibytes, NULL, 0);
> > +		}
> > +	}
> > +
> > +	return __hv_do_hypercall((u64)varhead_code, input, output);
> > +}
> > +
> >  /*
> >   * Hypervisor's notion of virtual processor ID is different from
> >   * Linux' notion of CPU ID. This information can only be retrieved
> > diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c
> > index 748a1c4172a6..b80293861c54 100644
> > --- a/drivers/hv/hv.c
> > +++ b/drivers/hv/hv.c
> > @@ -92,7 +92,8 @@ int hv_post_message(union hv_connection_id connection_id,
> >  	aligned_msg->payload_size = payload_size;
> >  	memcpy((void *)aligned_msg->payload, payload, payload_size);
> >
> > -	status = hv_do_hypercall(HVCALL_POST_MESSAGE, aligned_msg, NULL);
> > +	/* fast hypercall doesn't seem supported */
> > +	status = __hv_do_hypercall(HVCALL_POST_MESSAGE, aligned_msg, NULL);
> >
> >  	/* Preemption must remain disabled until after the hypercall
> >  	 * so some other thread can't get scheduled onto this cpu and
> > diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c
> > index ee80e79db21a..ea4aab9a6d1c 100644
> > --- a/drivers/pci/controller/pci-hyperv.c
> > +++ b/drivers/pci/controller/pci-hyperv.c
> > @@ -461,7 +461,7 @@ struct hv_pcibus_device {
> >  	struct irq_domain *irq_domain;
> >
> >  	/* hypercall arg, must not cross page boundary */
> > -	struct retarget_msi_interrupt retarget_msi_interrupt_params;
> > +	__attribute__((__aligned__(16))) struct retarget_msi_interrupt retarget_msi_interrupt_params;
> >
> >  	spinlock_t retarget_msi_interrupt_lock;
> >
> > @@ -984,8 +984,9 @@ static void hv_irq_unmask(struct irq_data *data)
> >  		}
> >  	}
> >
> > -	res = hv_do_hypercall(HVCALL_RETARGET_INTERRUPT | (var_size << 17),
> > -			      params, NULL);
> > +	res = hv_do_hypercall(
> > +		HVCALL_RETARGET_INTERRUPT | (var_size << 17),
> > +		params, sizeof(*params) + var_size * 8, NULL, 0);
>
> This probably isn't performance-critical and can be left as is.
> (Frankly I'm struggling to understand why this has to be a hypercall at
> all.)

If an interrupt is pending, this hypercall gives the VMM a chance to
inject the interrupt into the VM.

thanks,
--
Isaku Yamahata <isaku.yamahata@xxxxxxxxx>