Re: [PATCH RFC] kvm: emulate avx vmovdq

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Tue, Aug 20, 2024 at 04:04:31PM -0700, Keith Busch wrote:
> From: Keith Busch <kbusch@xxxxxxxxxx>
> 
> Because people would like to use this (see "Link"), interpret the VEX
> prefix and emulate mov instructions accordingly. The only avx
> instructions emulated here are the aligned and unaligned mov.
> Everything else will fail as before.
> 
> This is new territory for me, so any feedback is appreciated.
> 
> To test, I executed the following program against a qemu emulated pci
> device resource. Prior to this kernel patch, it would fail with
> 
>   traps: vmovdq[378] trap invalid opcode ip:4006b2 sp:7ffe2f5bb680 error:0 in vmovdq[6b2,400000+1000]
> 
> And is successful with this kernel patch.
> 
> Test program, vmovdq.c:
> 
>   #include <x86intrin.h>
>   #include <fcntl.h>
>   #include <stdint.h>
>   #include <stdio.h>
>   #include <string.h>
>   #include <unistd.h>
>   #include <sys/mman.h>
> 
>   static inline void read_avx_reg(__m256i *data)
>   {
>           asm("vmovdqu %%ymm0, %0" : "=m"(*data));
>   }
> 
>   static inline void write_avx_reg(const __m256i *data)
>   {
>           asm("vmovdqu %0, %%ymm0" : : "m"(*data));
>   }
> 
>   int main(int argc, char **argv)
>   {
>           __m256i s, *d;
>           void *map;
>           int fd;
> 
>           if(argc < 2) {
>                   fprintf(stderr, "usage: %s <resource-file>\n", argv[1]);
>                   return 1;
>           }
> 
>           fd = open(argv[1], O_RDWR | O_SYNC);
>           if (fd < 0) {
>                   fprintf(stderr, "failed to open %s\n", argv[1]);
>                   return 1;
>           }
> 
>           map = mmap(0, 0x1000, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
>           if (map == MAP_FAILED) {
>                   fprintf(stderr, "failed to mmap %s\n", argv[1]);
>                   return 1;
> 
>           }
> 
>           memset(&s, 0xd0, sizeof(s));
>           d = (__m256i *)map;
> 
>           write_avx_reg(&s);
>           read_avx_reg(d);
> 
>           write_avx_reg(d);
>           read_avx_reg(&s);
> 
>           return 0;
>   }
> 
> Link: https://lore.kernel.org/kvm/BD108C42-0382-4B17-B601-434A4BD038E7@xxxxxx/T/
> Cc: Alex Williamson <alex.williamson@xxxxxxxxxx>
> Cc: Paolo Bonzini <pbonzini@xxxxxxxxxx>
> Cc: Xu Liu <liuxu@xxxxxxxx>
> Signed-off-by: Keith Busch <kbusch@xxxxxxxxxx>
> ---
>  arch/x86/kvm/emulate.c     | 136 ++++++++++++++++++++++++++++++++-----
>  arch/x86/kvm/fpu.h         |  62 +++++++++++++++++
>  arch/x86/kvm/kvm_emulate.h |   6 +-
>  3 files changed, 187 insertions(+), 17 deletions(-)
> 
> diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
> index e72aed25d7212..aad8da15b6b77 100644
> --- a/arch/x86/kvm/emulate.c
> +++ b/arch/x86/kvm/emulate.c
> @@ -1144,6 +1144,19 @@ static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
>  	else
>  		reg = (ctxt->b & 7) | ((ctxt->rex_prefix & 1) << 3);
>  
> +	if (ctxt->d & Avx) {
> +		op->bytes = ctxt->op_bytes;
> +		if (op->bytes == 16) {
> +			op->type = OP_XMM;
> +			op->addr.xmm = reg;
> +			kvm_read_sse_reg(reg, &op->vec_val);
> +		} else {
> +			op->type = OP_YMM;
> +			op->addr.ymm = reg;
> +			kvm_read_avx_reg(reg, &op->vec_val2);
> +		}
> +		return;
> +	}
>  	if (ctxt->d & Sse) {
>  		op->type = OP_XMM;
>  		op->bytes = 16;
> @@ -1177,13 +1190,24 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
>  			struct operand *op)
>  {
>  	u8 sib;
> -	int index_reg, base_reg, scale;
> +	int index_reg = 0, base_reg = 0, scale = 0;
>  	int rc = X86EMUL_CONTINUE;
>  	ulong modrm_ea = 0;
>  
> -	ctxt->modrm_reg = ((ctxt->rex_prefix << 1) & 8); /* REX.R */
> -	index_reg = (ctxt->rex_prefix << 2) & 8; /* REX.X */
> -	base_reg = (ctxt->rex_prefix << 3) & 8; /* REX.B */
> +	if (ctxt->vex_prefix[0]) {
> +		if ((ctxt->vex_prefix[1] & 0x80) == 0)  /* VEX._R */
> +			ctxt->modrm_reg = 8;
> +		if (ctxt->vex_prefix[0] == 0xc4) {
> +			if ((ctxt->vex_prefix[1] & 0x40) == 0) /* VEX._X */
> +				index_reg = 8;
> +			if ((ctxt->vex_prefix[1] & 0x20) == 0) /* VEX._B */
> +				base_reg = 8;
> +		}
> +	} else {
> +		ctxt->modrm_reg = ((ctxt->rex_prefix << 1) & 8); /* REX.R */
> +		index_reg = (ctxt->rex_prefix << 2) & 8; /* REX.X */
> +		base_reg = (ctxt->rex_prefix << 3) & 8; /* REX.B */
> +	}
>  
>  	ctxt->modrm_mod = (ctxt->modrm & 0xc0) >> 6;
>  	ctxt->modrm_reg |= (ctxt->modrm & 0x38) >> 3;
> @@ -1195,6 +1219,19 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
>  		op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
>  		op->addr.reg = decode_register(ctxt, ctxt->modrm_rm,
>  				ctxt->d & ByteOp);
> +		if (ctxt->d & Avx) {
> +			op->bytes = ctxt->op_bytes;
> +			if (op->bytes == 16) {
> +				op->type = OP_XMM;
> +				op->addr.xmm = ctxt->modrm_rm;
> +				kvm_read_sse_reg(ctxt->modrm_rm, &op->vec_val);
> +			} else {
> +				op->type = OP_YMM;
> +				op->addr.ymm = ctxt->modrm_rm;
> +				kvm_read_avx_reg(ctxt->modrm_rm, &op->vec_val2);
> +			}
> +			return rc;
> +		}
>  		if (ctxt->d & Sse) {
>  			op->type = OP_XMM;
>  			op->bytes = 16;
> @@ -1808,6 +1845,9 @@ static int writeback(struct x86_emulate_ctxt *ctxt, struct operand *op)
>  	case OP_XMM:
>  		kvm_write_sse_reg(op->addr.xmm, &op->vec_val);
>  		break;
> +	case OP_YMM:
> +		kvm_write_avx_reg(op->addr.ymm, &op->vec_val2);
> +		break;
>  	case OP_MM:
>  		kvm_write_mmx_reg(op->addr.mm, &op->mm_val);
>  		break;
> @@ -3232,7 +3272,7 @@ static int em_rdpmc(struct x86_emulate_ctxt *ctxt)
>  
>  static int em_mov(struct x86_emulate_ctxt *ctxt)
>  {
> -	memcpy(ctxt->dst.valptr, ctxt->src.valptr, sizeof(ctxt->src.valptr));
> +	memcpy(ctxt->dst.valptr, ctxt->src.valptr, ctxt->op_bytes);
>  	return X86EMUL_CONTINUE;
>  }
>  
> @@ -4460,6 +4500,23 @@ static const struct opcode twobyte_table[256] = {
>  	N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N
>  };
>  
> +static const struct gprefix pfx_avx_0f_6f_0f_7f = {
> +	N, I(Avx | Aligned, em_mov), N, I(Avx | Unaligned, em_mov),
> +};
> +
> +static const struct opcode avx_0f_table[256] = {
> +	/* 0x00 - 0x5f */
> +	X16(N), X16(N), X16(N), X16(N), X16(N), X16(N),
> +	/* 0x60 - 0x6F */
> +	X8(N), X4(N), X2(N), N,
> +	GP(SrcMem | DstReg | ModRM | Mov, &pfx_avx_0f_6f_0f_7f),
> +	/* 0x70 - 0x7F */
> +	X8(N), X4(N), X2(N), N,
> +	GP(SrcReg | DstMem | ModRM | Mov, &pfx_avx_0f_6f_0f_7f),
> +	/* 0x80 - 0xFF */
> +	X16(N), X16(N), X16(N), X16(N), X16(N), X16(N), X16(N), X16(N),
> +};
> +
>  static const struct instr_dual instr_dual_0f_38_f0 = {
>  	I(DstReg | SrcMem | Mov, em_movbe), N
>  };
> @@ -4724,6 +4781,41 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op,
>  	return rc;
>  }
>  
> +static struct opcode x86_decode_avx(struct x86_emulate_ctxt *ctxt)
> +{
> +	u8 map, pp, l, v;
> +
> +	if (ctxt->vex_prefix[0] == 0xc5) {
> +		pp = ctxt->vex_prefix[1] & 0x3;	/* VEX.p1p0 */
> +		l = ctxt->vex_prefix[1] & 0x4;	/* VEX.L */
> +		v = ~((ctxt->vex_prefix[1] >> 3) & 0xf) & 0xf; /* VEX.v3v2v1v0 */
> +		map = 1; /* for 0f map */
> +		ctxt->opcode_len = 2;
> +	} else {
> +		map = ctxt->vex_prefix[1] & 0x1f;
> +		pp = ctxt->vex_prefix[2] & 0x3;
> +		l = ctxt->vex_prefix[2] & 0x4;
> +		v = ~((ctxt->vex_prefix[2] >> 3) & 0xf) & 0xf;
> +		ctxt->opcode_len = 3;
> +	}
> +
> +	if (l)
> +		ctxt->op_bytes = 32;
> +	else
> +		ctxt->op_bytes = 16;
> +
> +	switch (pp) {
> +	case 0: ctxt->rep_prefix = 0x00; break;
> +	case 1: ctxt->rep_prefix = 0x66; break;
> +	case 2: ctxt->rep_prefix = 0xf3; break;
> +	case 3: ctxt->rep_prefix = 0xf2; break;
> +	}
> +
> +	if (map == 1 && !v)
> +		return avx_0f_table[ctxt->b];
> +	return (struct opcode){.flags = NotImpl};

Can we check whether the host supports AVX here? I.e., if the host does not
support AVX, return NotImpl. My concern is that if the host does not support
AVX, a guest executing AVX instructions could cause the host to panic, because
the host itself executes AVX instructions (the vmovdqa in _kvm_read_avx_reg()
and _kvm_write_avx_reg()) during the emulation.

Yes, if the host does not support AVX, it may not report AVX to the guest, but
a guest can always skip the AVX feature check, as the test program in the
commit message does.

> +}
> +
>  int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len, int emulation_type)
>  {
>  	int rc = X86EMUL_CONTINUE;
> @@ -4777,7 +4869,7 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len, int
>  	ctxt->op_bytes = def_op_bytes;
>  	ctxt->ad_bytes = def_ad_bytes;
>  
> -	/* Legacy prefixes. */
> +	/* prefixes. */
>  	for (;;) {
>  		switch (ctxt->b = insn_fetch(u8, ctxt)) {
>  		case 0x66:	/* operand-size override */
> @@ -4822,6 +4914,19 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len, int
>  				goto done_prefixes;
>  			ctxt->rex_prefix = ctxt->b;
>  			continue;
> +		case 0xc4: /* VEX */
> +			if (mode != X86EMUL_MODE_PROT64)
> +				goto done_prefixes;
> +			ctxt->vex_prefix[0] = ctxt->b;
> +			ctxt->vex_prefix[1] = insn_fetch(u8, ctxt);
> +			ctxt->vex_prefix[2] = insn_fetch(u8, ctxt);
> +			break;
> +		case 0xc5: /* VEX */
> +			if (mode != X86EMUL_MODE_PROT64)
> +				goto done_prefixes;
> +			ctxt->vex_prefix[0] = ctxt->b;
> +			ctxt->vex_prefix[1] = insn_fetch(u8, ctxt);
> +			break;
>  		case 0xf0:	/* LOCK */
>  			ctxt->lock_prefix = 1;
>  			break;
> @@ -4844,10 +4949,10 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len, int
>  	if (ctxt->rex_prefix & 8)
>  		ctxt->op_bytes = 8;	/* REX.W */
>  
> -	/* Opcode byte(s). */
> -	opcode = opcode_table[ctxt->b];
> -	/* Two-byte opcode? */
> -	if (ctxt->b == 0x0f) {
> +	if (ctxt->vex_prefix[0]) {
> +		opcode = x86_decode_avx(ctxt);
> +	} else if (ctxt->b == 0x0f) {
> +		/* Two-byte opcode? */
>  		ctxt->opcode_len = 2;
>  		ctxt->b = insn_fetch(u8, ctxt);
>  		opcode = twobyte_table[ctxt->b];
> @@ -4858,18 +4963,16 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len, int
>  			ctxt->b = insn_fetch(u8, ctxt);
>  			opcode = opcode_map_0f_38[ctxt->b];
>  		}
> +	} else {
> +		/* Opcode byte(s). */
> +		opcode = opcode_table[ctxt->b];
>  	}
> +
>  	ctxt->d = opcode.flags;
>  
>  	if (ctxt->d & ModRM)
>  		ctxt->modrm = insn_fetch(u8, ctxt);
>  
> -	/* vex-prefix instructions are not implemented */
> -	if (ctxt->opcode_len == 1 && (ctxt->b == 0xc5 || ctxt->b == 0xc4) &&
> -	    (mode == X86EMUL_MODE_PROT64 || (ctxt->modrm & 0xc0) == 0xc0)) {
> -		ctxt->d = NotImpl;
> -	}
> -
>  	while (ctxt->d & GroupMask) {
>  		switch (ctxt->d & GroupMask) {
>  		case Group:
> @@ -5091,6 +5194,7 @@ void init_decode_cache(struct x86_emulate_ctxt *ctxt)
>  	/* Clear fields that are set conditionally but read without a guard. */
>  	ctxt->rip_relative = false;
>  	ctxt->rex_prefix = 0;
> +	memset(ctxt->vex_prefix, 0, sizeof(ctxt->vex_prefix));;
                                                             ^^
There are two semicolons here; please drop one.

>  	ctxt->lock_prefix = 0;
>  	ctxt->rep_prefix = 0;
>  	ctxt->regs_valid = 0;
> diff --git a/arch/x86/kvm/fpu.h b/arch/x86/kvm/fpu.h
> index 3ba12888bf66a..9bc08c3c53f5d 100644
> --- a/arch/x86/kvm/fpu.h
> +++ b/arch/x86/kvm/fpu.h
> @@ -15,6 +15,54 @@ typedef u32		__attribute__((vector_size(16))) sse128_t;
>  #define sse128_l3(x)	({ __sse128_u t; t.vec = x; t.as_u32[3]; })
>  #define sse128(lo, hi)	({ __sse128_u t; t.as_u64[0] = lo; t.as_u64[1] = hi; t.vec; })
>  
> +typedef u32		__attribute__((vector_size(32))) avx256_t;
> +
> +static inline void _kvm_read_avx_reg(int reg, avx256_t *data)
> +{
> +	switch (reg) {
> +	case 0:  asm("vmovdqa %%ymm0,  %0" : "=m"(*data)); break;
> +	case 1:  asm("vmovdqa %%ymm1,  %0" : "=m"(*data)); break;
> +	case 2:  asm("vmovdqa %%ymm2,  %0" : "=m"(*data)); break;
> +	case 3:  asm("vmovdqa %%ymm3,  %0" : "=m"(*data)); break;
> +	case 4:  asm("vmovdqa %%ymm4,  %0" : "=m"(*data)); break;
> +	case 5:  asm("vmovdqa %%ymm5,  %0" : "=m"(*data)); break;
> +	case 6:  asm("vmovdqa %%ymm6,  %0" : "=m"(*data)); break;
> +	case 7:  asm("vmovdqa %%ymm7,  %0" : "=m"(*data)); break;
> +	case 8:  asm("vmovdqa %%ymm8,  %0" : "=m"(*data)); break;
> +	case 9:  asm("vmovdqa %%ymm9,  %0" : "=m"(*data)); break;
> +	case 10: asm("vmovdqa %%ymm10, %0" : "=m"(*data)); break;
> +	case 11: asm("vmovdqa %%ymm11, %0" : "=m"(*data)); break;
> +	case 12: asm("vmovdqa %%ymm12, %0" : "=m"(*data)); break;
> +	case 13: asm("vmovdqa %%ymm13, %0" : "=m"(*data)); break;
> +	case 14: asm("vmovdqa %%ymm14, %0" : "=m"(*data)); break;
> +	case 15: asm("vmovdqa %%ymm15, %0" : "=m"(*data)); break;
> +	default: BUG();
> +	}
> +}
> +
> +static inline void _kvm_write_avx_reg(int reg, const avx256_t *data)
> +{
> +	switch (reg) {
> +	case 0:  asm("vmovdqa %0, %%ymm0"  : : "m"(*data)); break;
> +	case 1:  asm("vmovdqa %0, %%ymm1"  : : "m"(*data)); break;
> +	case 2:  asm("vmovdqa %0, %%ymm2"  : : "m"(*data)); break;
> +	case 3:  asm("vmovdqa %0, %%ymm3"  : : "m"(*data)); break;
> +	case 4:  asm("vmovdqa %0, %%ymm4"  : : "m"(*data)); break;
> +	case 5:  asm("vmovdqa %0, %%ymm5"  : : "m"(*data)); break;
> +	case 6:  asm("vmovdqa %0, %%ymm6"  : : "m"(*data)); break;
> +	case 7:  asm("vmovdqa %0, %%ymm7"  : : "m"(*data)); break;
> +	case 8:  asm("vmovdqa %0, %%ymm8"  : : "m"(*data)); break;
> +	case 9:  asm("vmovdqa %0, %%ymm9"  : : "m"(*data)); break;
> +	case 10: asm("vmovdqa %0, %%ymm10" : : "m"(*data)); break;
> +	case 11: asm("vmovdqa %0, %%ymm11" : : "m"(*data)); break;
> +	case 12: asm("vmovdqa %0, %%ymm12" : : "m"(*data)); break;
> +	case 13: asm("vmovdqa %0, %%ymm13" : : "m"(*data)); break;
> +	case 14: asm("vmovdqa %0, %%ymm14" : : "m"(*data)); break;
> +	case 15: asm("vmovdqa %0, %%ymm15" : : "m"(*data)); break;
> +	default: BUG();
> +	}
> +}
> +
>  static inline void _kvm_read_sse_reg(int reg, sse128_t *data)
>  {
>  	switch (reg) {
> @@ -109,6 +157,20 @@ static inline void kvm_fpu_put(void)
>  	fpregs_unlock();
>  }
>  
> +static inline void kvm_read_avx_reg(int reg, avx256_t *data)
> +{
> +	kvm_fpu_get();
> +	_kvm_read_avx_reg(reg, data);
> +	kvm_fpu_put();
> +}
> +
> +static inline void kvm_write_avx_reg(int reg, const avx256_t  *data)
> +{
> +	kvm_fpu_get();
> +	_kvm_write_avx_reg(reg, data);
> +	kvm_fpu_put();
> +}
> +
>  static inline void kvm_read_sse_reg(int reg, sse128_t *data)
>  {
>  	kvm_fpu_get();
> diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h
> index 55a18e2f2dcd9..0e12f187e0b57 100644
> --- a/arch/x86/kvm/kvm_emulate.h
> +++ b/arch/x86/kvm/kvm_emulate.h
> @@ -239,7 +239,7 @@ struct x86_emulate_ops {
>  
>  /* Type, address-of, and value of an instruction's operand. */
>  struct operand {
> -	enum { OP_REG, OP_MEM, OP_MEM_STR, OP_IMM, OP_XMM, OP_MM, OP_NONE } type;
> +	enum { OP_REG, OP_MEM, OP_MEM_STR, OP_IMM, OP_XMM, OP_YMM, OP_MM, OP_NONE } type;
>  	unsigned int bytes;
>  	unsigned int count;
>  	union {
> @@ -253,13 +253,16 @@ struct operand {
>  			unsigned seg;
>  		} mem;
>  		unsigned xmm;
> +		unsigned ymm;
>  		unsigned mm;
>  	} addr;
>  	union {
>  		unsigned long val;
>  		u64 val64;
>  		char valptr[sizeof(sse128_t)];
> +		char valptr2[sizeof(avx256_t)];
>  		sse128_t vec_val;
> +		avx256_t vec_val2;
>  		u64 mm_val;
>  		void *data;
>  	};
> @@ -347,6 +350,7 @@ struct x86_emulate_ctxt {
>  
>  	bool rip_relative;
>  	u8 rex_prefix;
> +	u8 vex_prefix[3];
>  	u8 lock_prefix;
>  	u8 rep_prefix;
>  	/* bitmaps of registers in _regs[] that can be read */
> -- 
> 2.43.5
> 
> 




[Index of Archives]     [KVM ARM]     [KVM ia64]     [KVM ppc]     [Virtualization Tools]     [Spice Development]     [Libvirt]     [Libvirt Users]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite Questions]     [Linux Kernel]     [Linux SCSI]     [XFree86]

  Powered by Linux