x86_decode_insn() can decode SSE instructions with multiple mandatory prefixes incorrectly. Apparently, f2/f3 overturns 66 regardless of relative position. Current code does the opposite: 66 wins and refuses to emulate instruction with both 66 and f2/f3 for no reason. This is easy to see switching between movdqa/movdqu with unaligned loads. .intel_syntax noprefix .global f f: push rbp mov rbp, rsp and rsp, ~15 add rsp, 1 #.byte 0x66, 0x0f, 0x6f, 0x04, 0x24 # SIGSEGV movdqa xmm0, [rsp] .byte 0xf3, 0x0f, 0x6f, 0x04, 0x24 # OK movdqu xmm0, [rsp] #.byte 0x66, 0x66, 0x0f, 0x6f, 0x04, 0x24 # SIGSEGV movdqa xmm0, [rsp] .byte 0x66, 0xf3, 0x0f, 0x6f, 0x04, 0x24 # OK movdqu xmm0, [rsp] .byte 0xf3, 0x66, 0x0f, 0x6f, 0x04, 0x24 # OK movdqu xmm0, [rsp] .byte 0xf3, 0xf3, 0x0f, 0x6f, 0x04, 0x24 # OK movdqu xmm0, [rsp] #.byte 0x66, 0x66, 0x66, 0x0f, 0x6f, 0x04, 0x24 # SIGSEGV movdqa xmm0, [rsp] .byte 0x66, 0x66, 0xf3, 0x0f, 0x6f, 0x04, 0x24 # OK movdqu xmm0, [rsp] .byte 0x66, 0xf3, 0x66, 0x0f, 0x6f, 0x04, 0x24 # OK movdqu xmm0, [rsp] .byte 0x66, 0xf3, 0xf3, 0x0f, 0x6f, 0x04, 0x24 # OK movdqu xmm0, [rsp] .byte 0xf3, 0x66, 0x66, 0x0f, 0x6f, 0x04, 0x24 # OK movdqu xmm0, [rsp] .byte 0xf3, 0x66, 0xf3, 0x0f, 0x6f, 0x04, 0x24 # OK movdqu xmm0, [rsp] .byte 0xf3, 0xf3, 0x66, 0x0f, 0x6f, 0x04, 0x24 # OK movdqu xmm0, [rsp] .byte 0xf3, 0xf3, 0xf3, 0x0f, 0x6f, 0x04, 0x24 # OK movdqu xmm0, [rsp] mov rsp, rbp pop rbp ret Tentative patch below (how do you even test it?). References: objdump: opcodes/i386-dis.c get_valid_dis386() bochs: bochs/cpu/decoder/fetchdecode64.cc fetchDecode64() Signed-off-by: Alexey Dobriyan <adobriyan@xxxxxxxxx> --- Do we even care given that compiler won't emit such instruction? arch/x86/kvm/emulate.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -4763,8 +4763,8 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len, int { int rc = X86EMUL_CONTINUE; int mode = ctxt->mode; - int def_op_bytes, def_ad_bytes, goffset, simd_prefix; - bool op_prefix = false; + int def_op_bytes, def_ad_bytes, goffset; + u8 simd_prefix = 0; bool has_seg_override = false; struct opcode opcode; u16 dummy; @@ -4816,7 +4816,13 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len, int for (;;) { switch (ctxt->b = insn_fetch(u8, ctxt)) { case 0x66: /* operand-size override */ - op_prefix = true; + /* + * Believe it or not, f2/f3 are more equal than 66. + * Homework: + * 1. [M21] Write a program to show this effect. + * 2. [M49] Find corresponding clause in SDM. + */ + simd_prefix = simd_prefix ?: 0x66; /* switch between 2/4 bytes */ ctxt->op_bytes = def_op_bytes ^ 6; break; @@ -4862,7 +4868,7 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len, int break; case 0xf2: /* REPNE/REPNZ */ case 0xf3: /* REP/REPE/REPZ */ - ctxt->rep_prefix = ctxt->b; + ctxt->rep_prefix = simd_prefix = ctxt->b; break; default: goto done_prefixes; @@ -4923,9 +4929,6 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len, int opcode = opcode.u.group[goffset]; break; case Prefix: - if (ctxt->rep_prefix && op_prefix) - return EMULATION_FAILED; - simd_prefix = op_prefix ? 0x66 : ctxt->rep_prefix; switch (simd_prefix) { case 0x00: opcode = opcode.u.gprefix->pfx_no; break; case 0x66: opcode = opcode.u.gprefix->pfx_66; break;