On Mon, Oct 29, 2018 at 06:54:50PM +0000, Roman Kagan <rkagan@xxxxxxxxxxxxx> wrote:
> On Wed, Oct 24, 2018 at 09:48:26PM -0700, Isaku Yamahata wrote:
> > hyper-v hypercall supports xmm fast hypercall
> > where argument is exchanged though regular/xmm registers.
> > This patch implements them and make use of them.
> > With this patch, hyperv/hv_apic.c and hyperv/mmu.c will use (xmm) fast
> > hypercall.
> >
> > Signed-off-by: Isaku Yamahata <isaku.yamahata@xxxxxxxxx>
> > ---
> >  arch/x86/hyperv/mmu.c               |   4 +-
> >  arch/x86/hyperv/nested.c            |   2 +-
> >  arch/x86/include/asm/hyperv-tlfs.h  |   3 +
> >  arch/x86/include/asm/mshyperv.h     | 176 ++++++++++++++++++++++++++++++++++--
> >  drivers/hv/hv.c                     |   3 +-
> >  drivers/pci/controller/pci-hyperv.c |   7 +-
> >  6 files changed, 179 insertions(+), 16 deletions(-)
> >
> > diff --git a/arch/x86/hyperv/mmu.c b/arch/x86/hyperv/mmu.c
> > index ef5f29f913d7..41820372bb3d 100644
> > --- a/arch/x86/hyperv/mmu.c
> > +++ b/arch/x86/hyperv/mmu.c
> > @@ -134,11 +134,11 @@ static void hyperv_flush_tlb_others(const struct cpumask *cpus,
> >  	if (info->end == TLB_FLUSH_ALL) {
> >  		flush->flags |= HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY;
> >  		status = hv_do_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE,
> > -					 flush, NULL);
> > +					 flush, sizeof(*flush), NULL, 0);
> >  	} else if (info->end &&
> >  		   ((info->end - info->start)/HV_TLB_FLUSH_UNIT) > max_gvas) {
> >  		status = hv_do_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE,
> > -					 flush, NULL);
> > +					 flush, sizeof(*flush), NULL, 0);
> >  	} else {
> >  		gva_n = fill_gva_list(flush->gva_list, 0,
> >  				      info->start, info->end);
> > diff --git a/arch/x86/hyperv/nested.c b/arch/x86/hyperv/nested.c
> > index b8e60cc50461..5fd24f4f2ae3 100644
> > --- a/arch/x86/hyperv/nested.c
> > +++ b/arch/x86/hyperv/nested.c
> > @@ -43,7 +43,7 @@ int hyperv_flush_guest_mapping(u64 as)
> >  	flush->flags = 0;
> >
> >  	status = hv_do_hypercall(HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE,
> > -				 flush, NULL);
> > +				 flush, sizeof(*flush), NULL, 0);
> >  	local_irq_restore(flags);
> >
> >  	if (!(status & HV_HYPERCALL_RESULT_MASK))
> > diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h
> > index 00e01d215f74..d80e0151b790 100644
> > --- a/arch/x86/include/asm/hyperv-tlfs.h
> > +++ b/arch/x86/include/asm/hyperv-tlfs.h
> > @@ -123,6 +123,7 @@
> >   * registers is available
> >   */
> >  #define HV_X64_HYPERCALL_PARAMS_XMM_AVAILABLE	(1 << 4)
> > +#define HV_X64_HYPERCALL_OUTPUT_XMM_AVAILABLE	(1 << 15)
> >  /* Support for a virtual guest idle state is available */
> >  #define HV_X64_GUEST_IDLE_STATE_AVAILABLE	(1 << 5)
> >  /* Guest crash data handler available */
> > @@ -383,10 +384,12 @@ enum HV_GENERIC_SET_FORMAT {
> >  #define HV_HYPERCALL_RESULT_MASK	GENMASK_ULL(15, 0)
> >  #define HV_HYPERCALL_FAST_BIT		BIT(16)
> >  #define HV_HYPERCALL_VARHEAD_OFFSET	17
> > +#define HV_HYPERCALL_VARHEAD_MASK	GENMASK_ULL(26, 17)
> >  #define HV_HYPERCALL_REP_COMP_OFFSET	32
> >  #define HV_HYPERCALL_REP_COMP_MASK	GENMASK_ULL(43, 32)
> >  #define HV_HYPERCALL_REP_START_OFFSET	48
> >  #define HV_HYPERCALL_REP_START_MASK	GENMASK_ULL(59, 48)
> > +#define HV_XMM_BYTE_MAX			112
> >
> >  /* hypercall status code */
> >  #define HV_STATUS_SUCCESS		0
> > diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
> > index f37704497d8f..5d8acb00ab94 100644
> > --- a/arch/x86/include/asm/mshyperv.h
> > +++ b/arch/x86/include/asm/mshyperv.h
> > @@ -132,16 +132,13 @@ extern struct clocksource *hyperv_cs;
> >  extern void *hv_hypercall_pg;
> >  extern void __percpu **hyperv_pcpu_input_arg;
> >
> > -static inline u64 hv_do_hypercall(u64 control, void *input, void *output)
> > +static inline u64 __hv_do_hypercall(u64 control, void *input, void *output)
>
> As some of the hypercalls either don't support or don't need to use fast
> xmm scheme I'd rather keep this function name as is and introduce a new
> one for fast xmm.

Okay.  Anyway, I don't have a strong opinion here.
So far, HVCALL_POST_MESSAGE is the only hypercall I've found that doesn't
support the fast scheme.  It's not clear from the TLFS spec.

> >  {
> >  	u64 input_address = input ? virt_to_phys(input) : 0;
> >  	u64 output_address = output ? virt_to_phys(output) : 0;
> >  	u64 hv_status;
> >
> >  #ifdef CONFIG_X86_64
> > -	if (!hv_hypercall_pg)
> > -		return U64_MAX;
> > -
>
> Later in this patch you still use this function directly, so you can't
> drop this test.

Will fix it.

> >  	__asm__ __volatile__("mov %4, %%r8\n"
> >  			     CALL_NOSPEC
> >  			     : "=a" (hv_status), ASM_CALL_CONSTRAINT,
> > @@ -155,9 +152,6 @@ static inline u64 hv_do_hypercall(u64 control, void *input, void *output)
> >  	u32 output_address_hi = upper_32_bits(output_address);
> >  	u32 output_address_lo = lower_32_bits(output_address);
> >
> > -	if (!hv_hypercall_pg)
> > -		return U64_MAX;
> > -
> >  	__asm__ __volatile__(CALL_NOSPEC
> >  			     : "=A" (hv_status),
> >  			       "+c" (input_address_lo), ASM_CALL_CONSTRAINT
> > @@ -201,7 +195,7 @@ static inline u64 hv_do_fast_hypercall8(u16 code, u64 input1)
> >  	return hv_status;
> >  }
> >
> > -/* Fast hypercall with 16 bytes of input */
> > +/* Fast hypercall with 16 bytes of input and no output */
> >  static inline u64 hv_do_fast_hypercall16(u16 code, u64 input1, u64 input2)
> >  {
> >  	u64 hv_status, control = (u64)code | HV_HYPERCALL_FAST_BIT;
> > @@ -246,11 +240,14 @@ static inline u64 hv_do_rep_hypercall(u16 code, u16 rep_count, u16 varhead_size,
> >  	u64 status;
> >  	u16 rep_comp;
> >
> > +	if (unlikely(!hv_hypercall_pg))
> > +		return U64_MAX;
> > +
> >  	control |= (u64)varhead_size << HV_HYPERCALL_VARHEAD_OFFSET;
> >  	control |= (u64)rep_count << HV_HYPERCALL_REP_COMP_OFFSET;
> >
> >  	do {
> > -		status = hv_do_hypercall(control, input, output);
> > +		status = __hv_do_hypercall(control, input, output);
> >  		if ((status & HV_HYPERCALL_RESULT_MASK) != HV_STATUS_SUCCESS)
> >  			return status;
> >
> > @@ -267,6 +264,167 @@ static inline u64 hv_do_rep_hypercall(u16 code, u16 rep_count, u16 varhead_size,
> >  	return status;
> >  }
> >
> > +/* ibytes = fixed header size + var header size + data size in bytes */
> > +static inline u64 hv_do_xmm_fast_hypercall(
> > +	u32 varhead_code, void *input, size_t ibytes,
> > +	void *output, size_t obytes)
> > +{
> > +	u64 control = (u64)varhead_code | HV_HYPERCALL_FAST_BIT;
> > +	u64 hv_status;
> > +	u64 input1;
> > +	u64 input2;
> > +	size_t i_end = roundup(ibytes, 16);
> > +	size_t o_end = i_end + roundup(obytes, 16);
> > +	u64 *ixmm = (u64 *)input + 2;
> > +	u64 tmp[(o_end - 16) / 8] __aligned((16));
> > +
> > +	BUG_ON(i_end <= 16);
> > +	BUG_ON(o_end > HV_XMM_BYTE_MAX);
> > +	BUG_ON(!IS_ALIGNED((unsigned long)input, 16));
> > +	BUG_ON(!IS_ALIGNED((unsigned long)output, 16));
> > +
> > +	/* it's assumed that there are at least two inputs */
> > +	input1 = ((u64 *)input)[0];
> > +	input2 = ((u64 *)input)[1];
> > +
> > +	preempt_disable();
>
> Don't you rather need kernel_fpu_begin() here (paired with
> kernel_fpu_end() at the end)?  This may affect your benchmark results
> noticably.

You're right, it would affect the results.  It's intentional NOT to use
kernel_fpu_begin()/end() for exactly that reason.  I'll add a comment on it.
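(For illustration only, a rough sketch of what the kernel_fpu_begin()/
kernel_fpu_end() variant would look like; this is not part of the patch,
and hv_xmm_load_sketch() is a made-up name.  The point is that the generic
in-kernel FPU API saves and restores the whole FPU state rather than only
the few XMM registers the hypercall actually clobbers, which is where the
extra cost would come from.)

#include <linux/types.h>
#include <asm/fpu/api.h>	/* kernel_fpu_begin()/kernel_fpu_end() */

/*
 * Sketch only: wrap the XMM usage in the generic in-kernel FPU API
 * instead of hand-spilling the clobbered registers with movdqa.
 * kernel_fpu_begin() disables preemption and saves the full FPU state;
 * kernel_fpu_end() restores it.
 */
static inline void hv_xmm_load_sketch(const u64 *ixmm)
{
	kernel_fpu_begin();
	__asm__ __volatile__("movdqa %0, %%xmm0" : : "m" (ixmm[0]));
	/* ... load the remaining XMM arguments and issue the hypercall ... */
	kernel_fpu_end();
}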
> > +	if (o_end > 2 * 8)
> > +		__asm__ __volatile__("movdqa %%xmm0, %0" : : "m" (tmp[0]));
> > +	if (o_end > 4 * 8)
> > +		__asm__ __volatile__("movdqa %%xmm1, %0" : : "m" (tmp[2]));
> > +	if (o_end > 6 * 8)
> > +		__asm__ __volatile__("movdqa %%xmm2, %0" : : "m" (tmp[4]));
> > +	if (o_end > 8 * 8)
> > +		__asm__ __volatile__("movdqa %%xmm3, %0" : : "m" (tmp[6]));
> > +	if (o_end > 10 * 8)
> > +		__asm__ __volatile__("movdqa %%xmm4, %0" : : "m" (tmp[8]));
> > +	if (o_end > 12 * 8)
> > +		__asm__ __volatile__("movdqa %%xmm5, %0" : : "m" (tmp[10]));
> > +	if (ibytes > 2 * 8)
> > +		__asm__ __volatile__("movdqa %0, %%xmm0" : : "m" (ixmm[0]));
> > +	if (ibytes > 4 * 8)
> > +		__asm__ __volatile__("movdqa %0, %%xmm1" : : "m" (ixmm[2]));
> > +	if (ibytes > 6 * 8)
> > +		__asm__ __volatile__("movdqa %0, %%xmm2" : : "m" (ixmm[4]));
> > +	if (ibytes > 8 * 8)
> > +		__asm__ __volatile__("movdqa %0, %%xmm3" : : "m" (ixmm[6]));
> > +	if (ibytes > 10 * 8)
> > +		__asm__ __volatile__("movdqa %0, %%xmm4" : : "m" (ixmm[8]));
> > +	if (ibytes > 12 * 8)
> > +		__asm__ __volatile__("movdqa %0, %%xmm5" : : "m" (ixmm[10]));
> > +
> > +#ifdef CONFIG_X86_64
> > +	__asm__ __volatile__("mov %4, %%r8\n"
> > +			     CALL_NOSPEC
> > +			     : "=a" (hv_status), ASM_CALL_CONSTRAINT,
> > +			       "+c" (control), "+d" (input1)
> > +			     : "r" (input2),
> > +			       THUNK_TARGET(hv_hypercall_pg)
> > +			     : "cc", "memory", "r8", "r9", "r10", "r11");
> > +#else
> > +	{
> > +		u32 input1_hi = upper_32_bits(input1);
> > +		u32 input1_lo = lower_32_bits(input1);
> > +		u32 input2_hi = upper_32_bits(input2);
> > +		u32 input2_lo = lower_32_bits(input2);
> > +
> > +		__asm__ __volatile__ (CALL_NOSPEC
> > +				      : "=A"(hv_status),
> > +					"+c"(input1_lo), ASM_CALL_CONSTRAINT
> > +				      : "A" (control), "b" (input1_hi),
> > +					"D"(input2_hi), "S"(input2_lo),
> > +					THUNK_TARGET(hv_hypercall_pg)
> > +				      : "cc", "memory");
> > +	}
> > +#endif
> > +	if (output) {
> > +		u64 *oxmm = (u64 *)output;
> > +		if (i_end <= 2 * 8 && 2 * 8 < o_end) {
> > +			__asm__ __volatile__(
> > +				"movdqa %%xmm0, %0" : "=m" (oxmm[0]));
> > +			oxmm += 2;
> > +		}
> > +		if (i_end <= 4 * 8 && 4 * 8 < o_end) {
> > +			__asm__ __volatile__(
> > +				"movdqa %%xmm1, %0" : "=m" (oxmm[0]));
> > +			oxmm += 2;
> > +		}
> > +		if (i_end <= 6 * 8 && 6 * 8 < o_end) {
> > +			__asm__ __volatile__(
> > +				"movdqa %%xmm2, %0" : "=m" (oxmm[0]));
> > +			oxmm += 2;
> > +		}
> > +		if (i_end <= 8 * 8 && 8 * 8 < o_end) {
> > +			__asm__ __volatile__(
> > +				"movdqa %%xmm3, %0" : "=m" (oxmm[0]));
> > +			oxmm += 2;
> > +		}
> > +		if (i_end <= 10 * 8 && 10 * 8 < o_end) {
> > +			__asm__ __volatile__(
> > +				"movdqa %%xmm4, %0" : "=m" (oxmm[0]));
> > +			oxmm += 2;
> > +		}
> > +		if (i_end <= 12 * 8 && 12 * 8 < o_end) {
> > +			__asm__ __volatile__(
> > +				"movdqa %%xmm5, %0" : "=m" (oxmm[0]));
> > +			oxmm += 2;
> > +		}
> > +	}
> > +	if (o_end > 2 * 8)
> > +		__asm__ __volatile__("movdqa %0, %%xmm0" : : "m" (tmp[0]));
> > +	if (o_end > 4 * 8)
> > +		__asm__ __volatile__("movdqa %0, %%xmm1" : : "m" (tmp[2]));
> > +	if (o_end > 6 * 8)
> > +		__asm__ __volatile__("movdqa %0, %%xmm2" : : "m" (tmp[4]));
> > +	if (o_end > 8 * 8)
> > +		__asm__ __volatile__("movdqa %0, %%xmm3" : : "m" (tmp[6]));
> > +	if (o_end > 10 * 8)
> > +		__asm__ __volatile__("movdqa %0, %%xmm4" : : "m" (tmp[8]));
> > +	if (o_end > 12 * 8)
> > +		__asm__ __volatile__("movdqa %0, %%xmm5" : : "m" (tmp[10]));
> > +	preempt_enable();
> > +
> > +	return hv_status;
> > +}
> > +
> > +static inline u64 hv_do_hypercall(
> > +	u32 varhead_code,
> > +	void *input, size_t ibytes, void *output, size_t obytes)
> > +{
> > +	if (unlikely(!hv_hypercall_pg))
> > +		return U64_MAX;
> > +
> > +	/* fast hypercall */
> > +	if (output == NULL && ibytes <= 16) {
> > +		u64 *i = (u64*)input;
> > +
> > +		WARN_ON((varhead_code & HV_HYPERCALL_VARHEAD_MASK) != 0);
> > +		if (ibytes <= 8)
> > +			return hv_do_fast_hypercall8((u16)varhead_code, i[0]);
> > +
> > +		return hv_do_fast_hypercall16((u16)varhead_code, i[0], i[1]);
> > +	}
> > +
> > +	/* xmm fast hypercall */
> > +	if (static_cpu_has(X86_FEATURE_XMM) &&
> > +	    ms_hyperv.features & HV_X64_HYPERCALL_PARAMS_XMM_AVAILABLE &&
> > +	    roundup(ibytes, 16) + obytes <= HV_XMM_BYTE_MAX) {
> > +		if (output) {
> > +			if (ms_hyperv.features &
> > +			    HV_X64_HYPERCALL_OUTPUT_XMM_AVAILABLE)
> > +				return hv_do_xmm_fast_hypercall(
> > +					varhead_code, input, ibytes,
> > +					output, obytes);
> > +		} else {
> > +			WARN_ON(obytes > 0);
> > +			return hv_do_xmm_fast_hypercall(
> > +				varhead_code, input, ibytes, NULL, 0);
> > +		}
> > +	}
> > +
> > +	return __hv_do_hypercall((u64)varhead_code, input, output);
> > +}
> > +
> >  /*
> >   * Hypervisor's notion of virtual processor ID is different from
> >   * Linux' notion of CPU ID. This information can only be retrieved
> > diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c
> > index 748a1c4172a6..b80293861c54 100644
> > --- a/drivers/hv/hv.c
> > +++ b/drivers/hv/hv.c
> > @@ -92,7 +92,8 @@ int hv_post_message(union hv_connection_id connection_id,
> >  	aligned_msg->payload_size = payload_size;
> >  	memcpy((void *)aligned_msg->payload, payload, payload_size);
> >
> > -	status = hv_do_hypercall(HVCALL_POST_MESSAGE, aligned_msg, NULL);
> > +	/* fast hypercall doesn't seem supported */
> > +	status = __hv_do_hypercall(HVCALL_POST_MESSAGE, aligned_msg, NULL);
> >
> >  	/* Preemption must remain disabled until after the hypercall
> >  	 * so some other thread can't get scheduled onto this cpu and
> > diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c
> > index ee80e79db21a..ea4aab9a6d1c 100644
> > --- a/drivers/pci/controller/pci-hyperv.c
> > +++ b/drivers/pci/controller/pci-hyperv.c
> > @@ -461,7 +461,7 @@ struct hv_pcibus_device {
> >  	struct irq_domain *irq_domain;
> >
> >  	/* hypercall arg, must not cross page boundary */
> > -	struct retarget_msi_interrupt retarget_msi_interrupt_params;
> > +	__attribute__((__aligned__(16))) struct retarget_msi_interrupt retarget_msi_interrupt_params;
> >
> >  	spinlock_t retarget_msi_interrupt_lock;
> >
> > @@ -984,8 +984,9 @@ static void hv_irq_unmask(struct irq_data *data)
> >  		}
> >  	}
> >
> > -	res = hv_do_hypercall(HVCALL_RETARGET_INTERRUPT | (var_size << 17),
> > -			      params, NULL);
> > +	res = hv_do_hypercall(
> > +		HVCALL_RETARGET_INTERRUPT | (var_size << 17),
> > +		params, sizeof(*params) + var_size * 8, NULL, 0);
>
> This probably isn't performance-critical and can be left as is.
> (Frankly I'm struggling to understand why this has to be a hypercall at
> all.)

If an interrupt is pending, this hypercall gives the VMM a chance to
inject the interrupt into the VM.

thanks,
--
Isaku Yamahata <isaku.yamahata@xxxxxxxxx>