[PATCH v2 1/6] x86/kernel/hyper-v: xmm fast hypercall as guest

Isaku Yamahata <isaku.yamahata@xxxxxxxxx> · Wed, 24 Oct 2018 21:48:26 -0700

Hyper-V supports the xmm fast hypercall, where arguments are exchanged
through general purpose and xmm registers.
This patch implements it and makes use of it.
With this patch, hyperv/hv_apic.c and hyperv/mmu.c will use (xmm) fast
hypercalls.

Signed-off-by: Isaku Yamahata <isaku.yamahata@xxxxxxxxx>
---
 arch/x86/hyperv/mmu.c               |   4 +-
 arch/x86/hyperv/nested.c            |   2 +-
 arch/x86/include/asm/hyperv-tlfs.h  |   3 +
 arch/x86/include/asm/mshyperv.h     | 176 ++++++++++++++++++++++++++++++++++--
 drivers/hv/hv.c                     |   3 +-
 drivers/pci/controller/pci-hyperv.c |   7 +-
 6 files changed, 179 insertions(+), 16 deletions(-)

diff --git a/arch/x86/hyperv/mmu.c b/arch/x86/hyperv/mmu.c
index ef5f29f913d7..41820372bb3d 100644
--- a/arch/x86/hyperv/mmu.c
+++ b/arch/x86/hyperv/mmu.c
@@ -134,11 +134,11 @@ static void hyperv_flush_tlb_others(const struct cpumask *cpus,
 	if (info->end == TLB_FLUSH_ALL) {
 		flush->flags |= HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY;
 		status = hv_do_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE,
-					 flush, NULL);
+					 flush, sizeof(*flush), NULL, 0);
 	} else if (info->end &&
 		   ((info->end - info->start)/HV_TLB_FLUSH_UNIT) > max_gvas) {
 		status = hv_do_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE,
-					 flush, NULL);
+					 flush, sizeof(*flush), NULL, 0);
 	} else {
 		gva_n = fill_gva_list(flush->gva_list, 0,
 				      info->start, info->end);
diff --git a/arch/x86/hyperv/nested.c b/arch/x86/hyperv/nested.c
index b8e60cc50461..5fd24f4f2ae3 100644
--- a/arch/x86/hyperv/nested.c
+++ b/arch/x86/hyperv/nested.c
@@ -43,7 +43,7 @@ int hyperv_flush_guest_mapping(u64 as)
 	flush->flags = 0;
 
 	status = hv_do_hypercall(HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE,
-				 flush, NULL);
+				 flush, sizeof(*flush), NULL, 0);
 	local_irq_restore(flags);
 
 	if (!(status & HV_HYPERCALL_RESULT_MASK))
diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h
index 00e01d215f74..d80e0151b790 100644
--- a/arch/x86/include/asm/hyperv-tlfs.h
+++ b/arch/x86/include/asm/hyperv-tlfs.h
@@ -123,6 +123,7 @@
  * registers is available
  */
 #define HV_X64_HYPERCALL_PARAMS_XMM_AVAILABLE		(1 << 4)
+#define HV_X64_HYPERCALL_OUTPUT_XMM_AVAILABLE		(1 << 15)
 /* Support for a virtual guest idle state is available */
 #define HV_X64_GUEST_IDLE_STATE_AVAILABLE		(1 << 5)
 /* Guest crash data handler available */
@@ -383,10 +384,12 @@ enum HV_GENERIC_SET_FORMAT {
 #define HV_HYPERCALL_RESULT_MASK	GENMASK_ULL(15, 0)
 #define HV_HYPERCALL_FAST_BIT		BIT(16)
 #define HV_HYPERCALL_VARHEAD_OFFSET	17
+#define HV_HYPERCALL_VARHEAD_MASK	GENMASK_ULL(26, 17)
 #define HV_HYPERCALL_REP_COMP_OFFSET	32
 #define HV_HYPERCALL_REP_COMP_MASK	GENMASK_ULL(43, 32)
 #define HV_HYPERCALL_REP_START_OFFSET	48
 #define HV_HYPERCALL_REP_START_MASK	GENMASK_ULL(59, 48)
+#define HV_XMM_BYTE_MAX			112
 
 /* hypercall status code */
 #define HV_STATUS_SUCCESS			0
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index f37704497d8f..5d8acb00ab94 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -132,16 +132,13 @@ extern struct clocksource *hyperv_cs;
 extern void *hv_hypercall_pg;
 extern void  __percpu  **hyperv_pcpu_input_arg;
 
-static inline u64 hv_do_hypercall(u64 control, void *input, void *output)
+static inline u64 __hv_do_hypercall(u64 control, void *input, void *output)
 {
 	u64 input_address = input ? virt_to_phys(input) : 0;
 	u64 output_address = output ? virt_to_phys(output) : 0;
 	u64 hv_status;
 
 #ifdef CONFIG_X86_64
-	if (!hv_hypercall_pg)
-		return U64_MAX;
-
 	__asm__ __volatile__("mov %4, %%r8\n"
 			     CALL_NOSPEC
 			     : "=a" (hv_status), ASM_CALL_CONSTRAINT,
@@ -155,9 +152,6 @@ static inline u64 hv_do_hypercall(u64 control, void *input, void *output)
 	u32 output_address_hi = upper_32_bits(output_address);
 	u32 output_address_lo = lower_32_bits(output_address);
 
-	if (!hv_hypercall_pg)
-		return U64_MAX;
-
 	__asm__ __volatile__(CALL_NOSPEC
 			     : "=A" (hv_status),
 			       "+c" (input_address_lo), ASM_CALL_CONSTRAINT
@@ -201,7 +195,7 @@ static inline u64 hv_do_fast_hypercall8(u16 code, u64 input1)
 		return hv_status;
 }
 
-/* Fast hypercall with 16 bytes of input */
+/* Fast hypercall with 16 bytes of input and no output */
 static inline u64 hv_do_fast_hypercall16(u16 code, u64 input1, u64 input2)
 {
 	u64 hv_status, control = (u64)code | HV_HYPERCALL_FAST_BIT;
@@ -246,11 +240,14 @@ static inline u64 hv_do_rep_hypercall(u16 code, u16 rep_count, u16 varhead_size,
 	u64 status;
 	u16 rep_comp;
 
+	if (unlikely(!hv_hypercall_pg))
+		return U64_MAX;
+
 	control |= (u64)varhead_size << HV_HYPERCALL_VARHEAD_OFFSET;
 	control |= (u64)rep_count << HV_HYPERCALL_REP_COMP_OFFSET;
 
 	do {
-		status = hv_do_hypercall(control, input, output);
+		status = __hv_do_hypercall(control, input, output);
 		if ((status & HV_HYPERCALL_RESULT_MASK) != HV_STATUS_SUCCESS)
 			return status;
 
@@ -267,6 +264,167 @@ static inline u64 hv_do_rep_hypercall(u16 code, u16 rep_count, u16 varhead_size,
 	return status;
 }
 
+/*
+ * XMM fast hypercall: input beyond the first 16 bytes goes in xmm0-5 and
+ * output comes back in the xmm registers following the last input one.
+ * ibytes = fixed header size + var header size + data size in bytes.
+ * input and output must be 16-byte aligned.  Returns the hypercall status.
+ */
+static inline u64 hv_do_xmm_fast_hypercall(
+	u32 varhead_code, void *input, size_t ibytes,
+	void *output, size_t obytes)
+{
+	u64 control = (u64)varhead_code | HV_HYPERCALL_FAST_BIT;
+	u64 hv_status, input1, input2;
+	size_t i_end = roundup(ibytes, 16);
+	size_t o_end = i_end + roundup(obytes, 16);
+	u64 *ixmm = (u64 *)input + 2;
+	u64 tmp[(HV_XMM_BYTE_MAX - 16) / 8] __aligned(16);
+
+	/* at least two u64 of input; with exactly two, output starts at xmm0 */
+	BUG_ON(i_end < 16);
+	BUG_ON(o_end > HV_XMM_BYTE_MAX);
+	BUG_ON(!IS_ALIGNED((unsigned long)input, 16));
+	BUG_ON(!IS_ALIGNED((unsigned long)output, 16));
+
+	input1 = ((u64 *)input)[0];
+	input2 = ((u64 *)input)[1];
+
+	preempt_disable();
+	/* save the xmm registers used below; "=m" because tmp is written */
+	if (o_end > 2 * 8)
+		asm volatile("movdqa %%xmm0, %0" : "=m" (tmp[0]));
+	if (o_end > 4 * 8)
+		asm volatile("movdqa %%xmm1, %0" : "=m" (tmp[2]));
+	if (o_end > 6 * 8)
+		asm volatile("movdqa %%xmm2, %0" : "=m" (tmp[4]));
+	if (o_end > 8 * 8)
+		asm volatile("movdqa %%xmm3, %0" : "=m" (tmp[6]));
+	if (o_end > 10 * 8)
+		asm volatile("movdqa %%xmm4, %0" : "=m" (tmp[8]));
+	if (o_end > 12 * 8)
+		asm volatile("movdqa %%xmm5, %0" : "=m" (tmp[10]));
+	if (ibytes > 2 * 8)
+		asm volatile("movdqa %0, %%xmm0" : : "m" (ixmm[0]));
+	if (ibytes > 4 * 8)
+		asm volatile("movdqa %0, %%xmm1" : : "m" (ixmm[2]));
+	if (ibytes > 6 * 8)
+		asm volatile("movdqa %0, %%xmm2" : : "m" (ixmm[4]));
+	if (ibytes > 8 * 8)
+		asm volatile("movdqa %0, %%xmm3" : : "m" (ixmm[6]));
+	if (ibytes > 10 * 8)
+		asm volatile("movdqa %0, %%xmm4" : : "m" (ixmm[8]));
+	if (ibytes > 12 * 8)
+		asm volatile("movdqa %0, %%xmm5" : : "m" (ixmm[10]));
+
+#ifdef CONFIG_X86_64
+	__asm__ __volatile__("mov %4, %%r8\n"
+			     CALL_NOSPEC
+			     : "=a" (hv_status), ASM_CALL_CONSTRAINT,
+			       "+c" (control), "+d" (input1)
+			     : "r" (input2),
+			       THUNK_TARGET(hv_hypercall_pg)
+			     : "cc", "memory", "r8", "r9", "r10", "r11");
+#else
+	{
+		u32 input1_hi = upper_32_bits(input1);
+		u32 input1_lo = lower_32_bits(input1);
+		u32 input2_hi = upper_32_bits(input2);
+		u32 input2_lo = lower_32_bits(input2);
+
+		__asm__ __volatile__ (CALL_NOSPEC
+				      : "=A"(hv_status),
+					"+c"(input1_lo), ASM_CALL_CONSTRAINT
+				      :	"A" (control), "b" (input1_hi),
+					"D"(input2_hi), "S"(input2_lo),
+					THUNK_TARGET(hv_hypercall_pg)
+				      : "cc", "memory");
+	}
+#endif
+	if (output) {
+		u64 *oxmm = (u64 *)output;
+		/* output begins in the first xmm not used for input */
+		if (i_end <= 2 * 8 && 2 * 8 < o_end) {
+			asm volatile("movdqa %%xmm0, %0" : "=m" (oxmm[0]));
+			oxmm += 2;
+		}
+		if (i_end <= 4 * 8 && 4 * 8 < o_end) {
+			asm volatile("movdqa %%xmm1, %0" : "=m" (oxmm[0]));
+			oxmm += 2;
+		}
+		if (i_end <= 6 * 8 && 6 * 8 < o_end) {
+			asm volatile("movdqa %%xmm2, %0" : "=m" (oxmm[0]));
+			oxmm += 2;
+		}
+		if (i_end <= 8 * 8 && 8 * 8 < o_end) {
+			asm volatile("movdqa %%xmm3, %0" : "=m" (oxmm[0]));
+			oxmm += 2;
+		}
+		if (i_end <= 10 * 8 && 10 * 8 < o_end) {
+			asm volatile("movdqa %%xmm4, %0" : "=m" (oxmm[0]));
+			oxmm += 2;
+		}
+		if (i_end <= 12 * 8 && 12 * 8 < o_end) {
+			asm volatile("movdqa %%xmm5, %0" : "=m" (oxmm[0]));
+			oxmm += 2;
+		}
+	}
+	/* put back the xmm contents saved before the hypercall */
+	if (o_end > 2 * 8)
+		asm volatile("movdqa %0, %%xmm0" : : "m" (tmp[0]));
+	if (o_end > 4 * 8)
+		asm volatile("movdqa %0, %%xmm1" : : "m" (tmp[2]));
+	if (o_end > 6 * 8)
+		asm volatile("movdqa %0, %%xmm2" : : "m" (tmp[4]));
+	if (o_end > 8 * 8)
+		asm volatile("movdqa %0, %%xmm3" : : "m" (tmp[6]));
+	if (o_end > 10 * 8)
+		asm volatile("movdqa %0, %%xmm4" : : "m" (tmp[8]));
+	if (o_end > 12 * 8)
+		asm volatile("movdqa %0, %%xmm5" : : "m" (tmp[10]));
+	preempt_enable();
+
+	return hv_status;
+}
+
+/*
+ * Issue a hypercall using the cheapest calling convention the argument
+ * sizes allow: register fast call, xmm fast call, or memory based.
+ * Returns the hypercall status, or U64_MAX if there is no hypercall page.
+ */
+static inline u64 hv_do_hypercall(u32 varhead_code, void *input, size_t ibytes,
+				  void *output, size_t obytes)
+{
+	if (unlikely(!hv_hypercall_pg))
+		return U64_MAX;
+
+	/* fast hypercall; a NULL input is left to the memory based path */
+	if (input && !output && ibytes <= 16) {
+		u64 *i = (u64 *)input;
+
+		WARN_ON((varhead_code & HV_HYPERCALL_VARHEAD_MASK) != 0);
+		if (ibytes <= 8)
+			return hv_do_fast_hypercall8((u16)varhead_code, i[0]);
+		return hv_do_fast_hypercall16((u16)varhead_code, i[0], i[1]);
+	}
+
+	/* xmm fast hypercall; only used when input exceeds 16 bytes */
+	if (ibytes > 16 && static_cpu_has(X86_FEATURE_XMM) &&
+	    ms_hyperv.features & HV_X64_HYPERCALL_PARAMS_XMM_AVAILABLE &&
+	    roundup(ibytes, 16) + obytes <= HV_XMM_BYTE_MAX) {
+		if (!output) {
+			WARN_ON(obytes > 0);
+			return hv_do_xmm_fast_hypercall(
+				varhead_code, input, ibytes, NULL, 0);
+		}
+		if (ms_hyperv.features & HV_X64_HYPERCALL_OUTPUT_XMM_AVAILABLE)
+			return hv_do_xmm_fast_hypercall(
+				varhead_code, input, ibytes, output, obytes);
+	}
+
+	return __hv_do_hypercall((u64)varhead_code, input, output);
+}
+
 /*
  * Hypervisor's notion of virtual processor ID is different from
  * Linux' notion of CPU ID. This information can only be retrieved
diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c
index 748a1c4172a6..b80293861c54 100644
--- a/drivers/hv/hv.c
+++ b/drivers/hv/hv.c
@@ -92,7 +92,8 @@ int hv_post_message(union hv_connection_id connection_id,
 	aligned_msg->payload_size = payload_size;
 	memcpy((void *)aligned_msg->payload, payload, payload_size);
 
-	status = hv_do_hypercall(HVCALL_POST_MESSAGE, aligned_msg, NULL);
+	/* fast hypercall doesn't seem to be supported for this call */
+	status = __hv_do_hypercall(HVCALL_POST_MESSAGE, aligned_msg, NULL);
 
 	/* Preemption must remain disabled until after the hypercall
 	 * so some other thread can't get scheduled onto this cpu and
diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c
index ee80e79db21a..ea4aab9a6d1c 100644
--- a/drivers/pci/controller/pci-hyperv.c
+++ b/drivers/pci/controller/pci-hyperv.c
@@ -461,7 +461,7 @@ struct hv_pcibus_device {
 	struct irq_domain *irq_domain;
 
 	/* hypercall arg, must not cross page boundary */
-	struct retarget_msi_interrupt retarget_msi_interrupt_params;
+	__attribute__((__aligned__(16))) struct retarget_msi_interrupt retarget_msi_interrupt_params;
 
 	spinlock_t retarget_msi_interrupt_lock;
 
@@ -984,8 +984,9 @@ static void hv_irq_unmask(struct irq_data *data)
 		}
 	}
 
-	res = hv_do_hypercall(HVCALL_RETARGET_INTERRUPT | (var_size << 17),
-			      params, NULL);
+	res = hv_do_hypercall(
+		HVCALL_RETARGET_INTERRUPT | (var_size << 17),
+		params, sizeof(*params) + var_size * 8, NULL, 0);
 
 exit_unlock:
 	spin_unlock_irqrestore(&hbus->retarget_msi_interrupt_lock, flags);
-- 
2.14.1