On Mon, Nov 11, 2024 at 12:59:47PM +0100, Peter Zijlstra wrote:

> +/*
> + * All the FASTOP magic above relies on there being *one* instance of this
> + * so it can JMP back, avoiding RET and it's various thunks.
> + */
> +static noinline int fastop(struct x86_emulate_ctxt *ctxt, fastop_t fop)
>  {
>  	ulong flags = (ctxt->eflags & EFLAGS_MASK) | X86_EFLAGS_IF;
>  
>  	if (!(ctxt->d & ByteOp))
>  		fop += __ffs(ctxt->dst.bytes) * FASTOP_SIZE;
>  
> -	asm("push %[flags]; popf; " CALL_NOSPEC " ; pushf; pop %[flags]\n"
> +	asm("push %[flags]; popf \n\t"
> +	    UNWIND_HINT(UNWIND_HINT_TYPE_SAVE, 0, 0, 0)
> +	    ASM_ANNOTATE(ANNOTYPE_JUMP_TABLE)
> +	    JMP_NOSPEC
> +	    "fastop_return: \n\t"
> +	    UNWIND_HINT(UNWIND_HINT_TYPE_RESTORE, 0, 0, 0)
> +	    "pushf; pop %[flags]\n"
>  	    : "+a"(ctxt->dst.val), "+d"(ctxt->src.val), [flags]"+D"(flags),
>  	      [thunk_target]"+S"(fop), ASM_CALL_CONSTRAINT
>  	    : "c"(ctxt->src2.val));

So Andrew is telling me the compiler is free to mess this up... Notably:

  https://github.com/llvm/llvm-project/issues/92161

In light of that, I wrote the below hack. It makes objtool sad (it
doesn't like STT_FUNC calling STT_NOTYPE), but it should work if we ever
run into the compiler being daft like that (it should fail to compile
because of the duplicate fastop_return label, so it's not a silent
failure).

Wear protective eye gear before continuing...

---
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -429,9 +429,9 @@ static inline void call_depth_return_thu
 
 #ifdef CONFIG_X86_64
 
-#define __CS_PREFIX \
+#define __CS_PREFIX(reg) \
 	".irp rs,r8,r9,r10,r11,r12,r13,r14,r15\n" \
-	".ifc %V[thunk_target],\\rs\n" \
+	".ifc " reg ",\\rs\n" \
 	".byte 0x2e\n" \
 	".endif\n" \
 	".endr\n"
@@ -441,12 +441,12 @@ static inline void call_depth_return_thu
  * which is ensured when CONFIG_MITIGATION_RETPOLINE is defined.
  */
 # define CALL_NOSPEC \
-	__CS_PREFIX \
+	__CS_PREFIX("%V[thunk_target]") \
 	"call __x86_indirect_thunk_%V[thunk_target]\n"
 
-# define JMP_NOSPEC \
-	__CS_PREFIX \
-	"jmp __x86_indirect_thunk_%V[thunk_target]\n"
+# define __JMP_NOSPEC(reg) \
+	__CS_PREFIX(reg) \
+	"jmp __x86_indirect_thunk_" reg "\n"
 
 # define THUNK_TARGET(addr) [thunk_target] "r" (addr)
 
@@ -478,10 +478,10 @@ static inline void call_depth_return_thu
 	"call *%[thunk_target]\n", \
 	X86_FEATURE_RETPOLINE_LFENCE)
 
-# define JMP_NOSPEC \
+# define __JMP_NOSPEC(reg) \
 	ALTERNATIVE_2( \
 	ANNOTATE_RETPOLINE_SAFE \
-	"jmp *%[thunk_target]\n", \
+	"jmp *%%" reg "\n", \
 	"       jmp    901f;\n" \
 	"       .align 16\n" \
 	"901:	call   903f;\n" \
@@ -490,22 +490,25 @@ static inline void call_depth_return_thu
 	"       jmp    902b;\n" \
 	"       .align 16\n" \
 	"903:	lea    4(%%esp), %%esp;\n" \
-	"       pushl  %[thunk_target];\n" \
+	"       pushl  %%" reg "\n" \
 	"       ret;\n", \
 	X86_FEATURE_RETPOLINE, \
 	"lfence;\n" \
 	ANNOTATE_RETPOLINE_SAFE \
-	"jmp *%[thunk_target]\n", \
+	"jmp *%%" reg "\n", \
 	X86_FEATURE_RETPOLINE_LFENCE)
 
 # define THUNK_TARGET(addr) [thunk_target] "rm" (addr)
 #endif
 
+
 #else /* No retpoline for C / inline asm */
 
 # define CALL_NOSPEC "call *%[thunk_target]\n"
-# define JMP_NOSPEC "jmp *%[thunk_target]\n"
+# define __JMP_NOSPEC(reg) "jmp *%%" reg "\n"
 # define THUNK_TARGET(addr) [thunk_target] "rm" (addr)
 
 #endif
 
+# define JMP_NOSPEC __JMP_NOSPEC("%V[thunk_target]")
+
 /* The Spectre V2 mitigation variants */
 enum spectre_v2_mitigation {
 	SPECTRE_V2_NONE,
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -5039,23 +5039,45 @@ static void fetch_possible_mmx_operand(s
 }
 
 /*
+ * Stub written in asm in order to ensure GCC doesn't duplicate the
+ * fastop_return: label.
+ *
+ * Custom calling convention:
+ *
+ * __fastop:
+ *	ax = ctxt->dst.val
+ *	dx = ctxt->src.val
+ *	cx = ctxt->src2.val
+ *	di = flags
+ *	si = fop
+ */
+asm (ASM_FUNC_ALIGN
+     "__fastop: \n\t"
+     "push %" _ASM_DI "\n\t"
+     "popf \n\t"
+     UNWIND_HINT(UNWIND_HINT_TYPE_SAVE, 0, 0, 0)
+     ASM_ANNOTATE(ANNOTYPE_JUMP_TABLE)
+     __JMP_NOSPEC(_ASM_SI)
+     "fastop_return: \n\t"
+     UNWIND_HINT(UNWIND_HINT_TYPE_RESTORE, 0, 0, 0)
+     "pushf \n\t"
+     "pop %" _ASM_DI "\n\t"
+     ASM_RET
+     ".type __fastop, @notype \n\t"
+     ".size __fastop, . - __fastop \n\t");
+
+/*
  * All the FASTOP magic above relies on there being *one* instance of this
  * so it can JMP back, avoiding RET and it's various thunks.
  */
-static noinline int fastop(struct x86_emulate_ctxt *ctxt, fastop_t fop)
+static int fastop(struct x86_emulate_ctxt *ctxt, fastop_t fop)
 {
 	ulong flags = (ctxt->eflags & EFLAGS_MASK) | X86_EFLAGS_IF;
 
 	if (!(ctxt->d & ByteOp))
 		fop += __ffs(ctxt->dst.bytes) * FASTOP_SIZE;
 
-	asm("push %[flags]; popf \n\t"
-	    UNWIND_HINT(UNWIND_HINT_TYPE_SAVE, 0, 0, 0)
-	    ASM_ANNOTATE(ANNOTYPE_JUMP_TABLE)
-	    JMP_NOSPEC
-	    "fastop_return: \n\t"
-	    UNWIND_HINT(UNWIND_HINT_TYPE_RESTORE, 0, 0, 0)
-	    "pushf; pop %[flags]\n"
+	asm("call __fastop"
 	    : "+a"(ctxt->dst.val), "+d"(ctxt->src.val), [flags]"+D"(flags),
 	      [thunk_target]"+S"(fop), ASM_CALL_CONSTRAINT
 	    : "c"(ctxt->src2.val));
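
For completeness, the duplicate-label failure mode this relies on is
easy to reproduce stand-alone. A minimal sketch (the file, function and
label names below are made up for illustration, this is not from the
patch): once the optimiser inlines the static function into both call
sites, the label is emitted twice and the build fails loudly:

	/* dup-label.c -- try: gcc -O2 -c dup-label.c */
	static int leaf(int x)
	{
		/*
		 * asm() without outputs is implicitly volatile, so its
		 * text is emitted verbatim everywhere the compiler
		 * materialises this statement.  When leaf() is inlined
		 * into both call sites below, "my_label:" appears twice
		 * and the assembler bails with something like:
		 *
		 *   Error: symbol `my_label' is already defined
		 */
		asm("my_label:");
		return x + 1;
	}

	int caller(int x)
	{
		return leaf(x) + leaf(x * 2);
	}

That same mechanism is what turns a compiler-duplicated fastop_return
into a hard build error rather than a silent miscompile.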