Re: [PATCH v2 1/6] x86/kernel/hyper-v: xmm fast hypercall as guest

Roman Kagan <rkagan@xxxxxxxxxxxxx> · Mon, 29 Oct 2018 18:54:50 +0000

On Wed, Oct 24, 2018 at 09:48:26PM -0700, Isaku Yamahata wrote:
> hyper-v hypercall supports xmm fast hypercall
> where argument is exchanged though regular/xmm registers.
> This patch implements them and make use of them.
> With this patch, hyperv/hv_apic.c and hyperv/mmu.c will use (xmm) fast
> hypercall.
> 
> Signed-off-by: Isaku Yamahata <isaku.yamahata@xxxxxxxxx>
> ---
>  arch/x86/hyperv/mmu.c               |   4 +-
>  arch/x86/hyperv/nested.c            |   2 +-
>  arch/x86/include/asm/hyperv-tlfs.h  |   3 +
>  arch/x86/include/asm/mshyperv.h     | 176 ++++++++++++++++++++++++++++++++++--
>  drivers/hv/hv.c                     |   3 +-
>  drivers/pci/controller/pci-hyperv.c |   7 +-
>  6 files changed, 179 insertions(+), 16 deletions(-)
> 
> diff --git a/arch/x86/hyperv/mmu.c b/arch/x86/hyperv/mmu.c
> index ef5f29f913d7..41820372bb3d 100644
> --- a/arch/x86/hyperv/mmu.c
> +++ b/arch/x86/hyperv/mmu.c
> @@ -134,11 +134,11 @@ static void hyperv_flush_tlb_others(const struct cpumask *cpus,
>  	if (info->end == TLB_FLUSH_ALL) {
>  		flush->flags |= HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY;
>  		status = hv_do_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE,
> -					 flush, NULL);
> +					 flush, sizeof(*flush), NULL, 0);
>  	} else if (info->end &&
>  		   ((info->end - info->start)/HV_TLB_FLUSH_UNIT) > max_gvas) {
>  		status = hv_do_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE,
> -					 flush, NULL);
> +					 flush, sizeof(*flush), NULL, 0);
>  	} else {
>  		gva_n = fill_gva_list(flush->gva_list, 0,
>  				      info->start, info->end);
> diff --git a/arch/x86/hyperv/nested.c b/arch/x86/hyperv/nested.c
> index b8e60cc50461..5fd24f4f2ae3 100644
> --- a/arch/x86/hyperv/nested.c
> +++ b/arch/x86/hyperv/nested.c
> @@ -43,7 +43,7 @@ int hyperv_flush_guest_mapping(u64 as)
>  	flush->flags = 0;
>  
>  	status = hv_do_hypercall(HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE,
> -				 flush, NULL);
> +				 flush, sizeof(*flush), NULL, 0);
>  	local_irq_restore(flags);
>  
>  	if (!(status & HV_HYPERCALL_RESULT_MASK))
> diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h
> index 00e01d215f74..d80e0151b790 100644
> --- a/arch/x86/include/asm/hyperv-tlfs.h
> +++ b/arch/x86/include/asm/hyperv-tlfs.h
> @@ -123,6 +123,7 @@
>   * registers is available
>   */
>  #define HV_X64_HYPERCALL_PARAMS_XMM_AVAILABLE		(1 << 4)
> +#define HV_X64_HYPERCALL_OUTPUT_XMM_AVAILABLE		(1 << 15)
>  /* Support for a virtual guest idle state is available */
>  #define HV_X64_GUEST_IDLE_STATE_AVAILABLE		(1 << 5)
>  /* Guest crash data handler available */
> @@ -383,10 +384,12 @@ enum HV_GENERIC_SET_FORMAT {
>  #define HV_HYPERCALL_RESULT_MASK	GENMASK_ULL(15, 0)
>  #define HV_HYPERCALL_FAST_BIT		BIT(16)
>  #define HV_HYPERCALL_VARHEAD_OFFSET	17
> +#define HV_HYPERCALL_VARHEAD_MASK	GENMASK_ULL(26, 17)
>  #define HV_HYPERCALL_REP_COMP_OFFSET	32
>  #define HV_HYPERCALL_REP_COMP_MASK	GENMASK_ULL(43, 32)
>  #define HV_HYPERCALL_REP_START_OFFSET	48
>  #define HV_HYPERCALL_REP_START_MASK	GENMASK_ULL(59, 48)
> +#define HV_XMM_BYTE_MAX			112
>  
>  /* hypercall status code */
>  #define HV_STATUS_SUCCESS			0
> diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
> index f37704497d8f..5d8acb00ab94 100644
> --- a/arch/x86/include/asm/mshyperv.h
> +++ b/arch/x86/include/asm/mshyperv.h
> @@ -132,16 +132,13 @@ extern struct clocksource *hyperv_cs;
>  extern void *hv_hypercall_pg;
>  extern void  __percpu  **hyperv_pcpu_input_arg;
>  
> -static inline u64 hv_do_hypercall(u64 control, void *input, void *output)
> +static inline u64 __hv_do_hypercall(u64 control, void *input, void *output)

As some of the hypercalls either don't support or don't need to use fast
xmm scheme I'd rather keep this function name as is and introduce a new
one for fast xmm.

>  {
>  	u64 input_address = input ? virt_to_phys(input) : 0;
>  	u64 output_address = output ? virt_to_phys(output) : 0;
>  	u64 hv_status;
>  
>  #ifdef CONFIG_X86_64
> -	if (!hv_hypercall_pg)
> -		return U64_MAX;
> -

Later in this patch you still use this function directly, so you can't
drop this test.

>  	__asm__ __volatile__("mov %4, %%r8\n"
>  			     CALL_NOSPEC
>  			     : "=a" (hv_status), ASM_CALL_CONSTRAINT,
> @@ -155,9 +152,6 @@ static inline u64 hv_do_hypercall(u64 control, void *input, void *output)
>  	u32 output_address_hi = upper_32_bits(output_address);
>  	u32 output_address_lo = lower_32_bits(output_address);
>  
> -	if (!hv_hypercall_pg)
> -		return U64_MAX;
> -
>  	__asm__ __volatile__(CALL_NOSPEC
>  			     : "=A" (hv_status),
>  			       "+c" (input_address_lo), ASM_CALL_CONSTRAINT
> @@ -201,7 +195,7 @@ static inline u64 hv_do_fast_hypercall8(u16 code, u64 input1)
>  		return hv_status;
>  }
>  
> -/* Fast hypercall with 16 bytes of input */
> +/* Fast hypercall with 16 bytes of input and no output */
>  static inline u64 hv_do_fast_hypercall16(u16 code, u64 input1, u64 input2)
>  {
>  	u64 hv_status, control = (u64)code | HV_HYPERCALL_FAST_BIT;
> @@ -246,11 +240,14 @@ static inline u64 hv_do_rep_hypercall(u16 code, u16 rep_count, u16 varhead_size,
>  	u64 status;
>  	u16 rep_comp;
>  
> +	if (unlikely(!hv_hypercall_pg))
> +		return U64_MAX;
> +
>  	control |= (u64)varhead_size << HV_HYPERCALL_VARHEAD_OFFSET;
>  	control |= (u64)rep_count << HV_HYPERCALL_REP_COMP_OFFSET;
>  
>  	do {
> -		status = hv_do_hypercall(control, input, output);
> +		status = __hv_do_hypercall(control, input, output);
>  		if ((status & HV_HYPERCALL_RESULT_MASK) != HV_STATUS_SUCCESS)
>  			return status;
>  
> @@ -267,6 +264,167 @@ static inline u64 hv_do_rep_hypercall(u16 code, u16 rep_count, u16 varhead_size,
>  	return status;
>  }
>  
> +/* ibytes = fixed header size + var header size + data size in bytes */
> +static inline u64 hv_do_xmm_fast_hypercall(
> +	u32 varhead_code, void *input, size_t ibytes,
> +	void *output, size_t obytes)
> +{
> +	u64 control = (u64)varhead_code | HV_HYPERCALL_FAST_BIT;
> +	u64 hv_status;
> +	u64 input1;
> +	u64 input2;
> +	size_t i_end = roundup(ibytes, 16);
> +	size_t o_end = i_end + roundup(obytes, 16);
> +	u64 *ixmm = (u64 *)input + 2;
> +	u64 tmp[(o_end - 16) / 8] __aligned((16));
> +
> +	BUG_ON(i_end <= 16);
> +	BUG_ON(o_end > HV_XMM_BYTE_MAX);
> +	BUG_ON(!IS_ALIGNED((unsigned long)input, 16));
> +	BUG_ON(!IS_ALIGNED((unsigned long)output, 16));
> +
> +	/* it's assumed that there are at least two inputs */
> +	input1 = ((u64 *)input)[0];
> +	input2 = ((u64 *)input)[1];
> +
> +	preempt_disable();

Don't you rather need kernel_fpu_begin() here (paired with
kernel_fpu_end() at the end)?  This may affect your benchmark results
noticably.

> +	if (o_end > 2 * 8)
> +		__asm__ __volatile__("movdqa %%xmm0, %0" : : "m" (tmp[0]));
> +	if (o_end > 4 * 8)
> +		__asm__ __volatile__("movdqa %%xmm1, %0" : : "m" (tmp[2]));
> +	if (o_end > 6 * 8)
> +		__asm__ __volatile__("movdqa %%xmm2, %0" : : "m" (tmp[4]));
> +	if (o_end > 8 * 8)
> +		__asm__ __volatile__("movdqa %%xmm3, %0" : : "m" (tmp[6]));
> +	if (o_end > 10 * 8)
> +		__asm__ __volatile__("movdqa %%xmm4, %0" : : "m" (tmp[8]));
> +	if (o_end > 12 * 8)
> +		__asm__ __volatile__("movdqa %%xmm5, %0" : : "m" (tmp[10]));
> +	if (ibytes > 2 * 8)
> +		__asm__ __volatile__("movdqa %0, %%xmm0" : : "m" (ixmm[0]));
> +	if (ibytes > 4 * 8)
> +		__asm__ __volatile__("movdqa %0, %%xmm1" : : "m" (ixmm[2]));
> +	if (ibytes > 6 * 8)
> +		__asm__ __volatile__("movdqa %0, %%xmm2" : : "m" (ixmm[4]));
> +	if (ibytes > 8 * 8)
> +		__asm__ __volatile__("movdqa %0, %%xmm3" : : "m" (ixmm[6]));
> +	if (ibytes > 10 * 8)
> +		__asm__ __volatile__("movdqa %0, %%xmm4" : : "m" (ixmm[8]));
> +	if (ibytes > 12 * 8)
> +		__asm__ __volatile__("movdqa %0, %%xmm5" : : "m" (ixmm[10]));
> +
> +#ifdef CONFIG_X86_64
> +	__asm__ __volatile__("mov %4, %%r8\n"
> +			     CALL_NOSPEC
> +			     : "=a" (hv_status), ASM_CALL_CONSTRAINT,
> +			       "+c" (control), "+d" (input1)
> +			     : "r" (input2),
> +			       THUNK_TARGET(hv_hypercall_pg)
> +			     : "cc", "memory", "r8", "r9", "r10", "r11");
> +#else
> +	{
> +		u32 input1_hi = upper_32_bits(input1);
> +		u32 input1_lo = lower_32_bits(input1);
> +		u32 input2_hi = upper_32_bits(input2);
> +		u32 input2_lo = lower_32_bits(input2);
> +
> +		__asm__ __volatile__ (CALL_NOSPEC
> +				      : "=A"(hv_status),
> +					"+c"(input1_lo), ASM_CALL_CONSTRAINT
> +				      :	"A" (control), "b" (input1_hi),
> +					"D"(input2_hi), "S"(input2_lo),
> +					THUNK_TARGET(hv_hypercall_pg)
> +				      : "cc", "memory");
> +	}
> +#endif
> +	if (output) {
> +		u64 *oxmm = (u64 *)output;
> +		if (i_end <= 2 * 8 && 2 * 8 < o_end) {
> +			__asm__ __volatile__(
> +				"movdqa %%xmm0, %0" : "=m" (oxmm[0]));
> +			oxmm += 2;
> +		}
> +		if (i_end <= 4 * 8 && 4 * 8 < o_end) {
> +			__asm__ __volatile__(
> +				"movdqa %%xmm1, %0" : "=m" (oxmm[0]));
> +			oxmm += 2;
> +		}
> +		if (i_end <= 6 * 8 && 6 * 8 < o_end) {
> +			__asm__ __volatile__(
> +				"movdqa %%xmm2, %0" : "=m" (oxmm[0]));
> +			oxmm += 2;
> +		}
> +		if (i_end <= 8 * 8 && 8 * 8 < o_end) {
> +			__asm__ __volatile__(
> +				"movdqa %%xmm3, %0" : "=m" (oxmm[0]));
> +			oxmm += 2;
> +		}
> +		if (i_end <= 10 * 8 && 10 * 8 < o_end) {
> +			__asm__ __volatile__(
> +				"movdqa %%xmm4, %0" : "=m" (oxmm[0]));
> +			oxmm += 2;
> +		}
> +		if (i_end <= 12 * 8 && 12 * 8 < o_end) {
> +			__asm__ __volatile__(
> +				"movdqa %%xmm5, %0" : "=m" (oxmm[0]));
> +			oxmm += 2;
> +		}
> +	}
> +	if (o_end > 2 * 8)
> +		__asm__ __volatile__("movdqa %0, %%xmm0" : : "m" (tmp[0]));
> +	if (o_end > 4 * 8)
> +		__asm__ __volatile__("movdqa %0, %%xmm1" : : "m" (tmp[2]));
> +	if (o_end > 6 * 8)
> +		__asm__ __volatile__("movdqa %0, %%xmm2" : : "m" (tmp[4]));
> +	if (o_end > 8 * 8)
> +		__asm__ __volatile__("movdqa %0, %%xmm3" : : "m" (tmp[6]));
> +	if (o_end > 10 * 8)
> +		__asm__ __volatile__("movdqa %0, %%xmm4" : : "m" (tmp[8]));
> +	if (o_end > 12 * 8)
> +		__asm__ __volatile__("movdqa %0, %%xmm5" : : "m" (tmp[10]));
> +	preempt_enable();
> +
> +	return hv_status;
> +}
> +
> +static inline u64 hv_do_hypercall(
> +	u32 varhead_code,
> +	void *input, size_t ibytes, void *output, size_t obytes)
> +{
> +	if (unlikely(!hv_hypercall_pg))
> +		return U64_MAX;
> +
> +	/* fast hypercall */
> +	if (output == NULL && ibytes <= 16) {
> +		u64 *i = (u64*)input;
> +
> +		WARN_ON((varhead_code & HV_HYPERCALL_VARHEAD_MASK) != 0);
> +		if (ibytes <= 8)
> +			return hv_do_fast_hypercall8((u16)varhead_code, i[0]);
> +
> +		return hv_do_fast_hypercall16((u16)varhead_code, i[0], i[1]);
> +	}
> +
> +	/* xmm fast hypercall */
> +	if (static_cpu_has(X86_FEATURE_XMM) &&
> +	    ms_hyperv.features & HV_X64_HYPERCALL_PARAMS_XMM_AVAILABLE &&
> +	    roundup(ibytes, 16) + obytes <= HV_XMM_BYTE_MAX) {
> +		if (output) {
> +			if (ms_hyperv.features &
> +			    HV_X64_HYPERCALL_OUTPUT_XMM_AVAILABLE)
> +				return hv_do_xmm_fast_hypercall(
> +					varhead_code, input, ibytes,
> +					output, obytes);
> +		} else {
> +			WARN_ON(obytes > 0);
> +			return hv_do_xmm_fast_hypercall(
> +				varhead_code, input, ibytes, NULL, 0);
> +		}
> +	}
> +
> +	return __hv_do_hypercall((u64)varhead_code, input, output);
> +}
> +
>  /*
>   * Hypervisor's notion of virtual processor ID is different from
>   * Linux' notion of CPU ID. This information can only be retrieved
> diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c
> index 748a1c4172a6..b80293861c54 100644
> --- a/drivers/hv/hv.c
> +++ b/drivers/hv/hv.c
> @@ -92,7 +92,8 @@ int hv_post_message(union hv_connection_id connection_id,
>  	aligned_msg->payload_size = payload_size;
>  	memcpy((void *)aligned_msg->payload, payload, payload_size);
>  
> -	status = hv_do_hypercall(HVCALL_POST_MESSAGE, aligned_msg, NULL);
> +	/* fast hypercall doesn't seem supported */
> +	status = __hv_do_hypercall(HVCALL_POST_MESSAGE, aligned_msg, NULL);
>  
>  	/* Preemption must remain disabled until after the hypercall
>  	 * so some other thread can't get scheduled onto this cpu and
> diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c
> index ee80e79db21a..ea4aab9a6d1c 100644
> --- a/drivers/pci/controller/pci-hyperv.c
> +++ b/drivers/pci/controller/pci-hyperv.c
> @@ -461,7 +461,7 @@ struct hv_pcibus_device {
>  	struct irq_domain *irq_domain;
>  
>  	/* hypercall arg, must not cross page boundary */
> -	struct retarget_msi_interrupt retarget_msi_interrupt_params;
> +	__attribute__((__aligned__(16))) struct retarget_msi_interrupt retarget_msi_interrupt_params;
>  
>  	spinlock_t retarget_msi_interrupt_lock;
>  
> @@ -984,8 +984,9 @@ static void hv_irq_unmask(struct irq_data *data)
>  		}
>  	}
>  
> -	res = hv_do_hypercall(HVCALL_RETARGET_INTERRUPT | (var_size << 17),
> -			      params, NULL);
> +	res = hv_do_hypercall(
> +		HVCALL_RETARGET_INTERRUPT | (var_size << 17),
> +		params, sizeof(*params) + var_size * 8, NULL, 0);

This probably isn't performance-critical and can be left as is.
(Frankly I'm struggling to understand why this has to be a hypercall at
all.)

Thanks,
Roman.