This is a note to let you know that I've just added the patch titled x86/alternatives: Optimize optimize_nops() to the 5.10-stable tree which can be found at: http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=summary The filename of the patch is: x86-alternatives-optimize-optimize_nops.patch and it can be found in the queue-5.10 subdirectory. If you, or anyone else, feels it should not be added to the stable tree, please let <stable@xxxxxxxxxxxxxxx> know about it. >From foo@baz Tue Jul 12 05:07:35 PM CEST 2022 From: Peter Zijlstra <peterz@xxxxxxxxxxxxx> Date: Fri, 26 Mar 2021 16:12:01 +0100 Subject: x86/alternatives: Optimize optimize_nops() From: Peter Zijlstra <peterz@xxxxxxxxxxxxx> commit 23c1ad538f4f371bdb67d8a112314842d5db7e5a upstream. Currently, optimize_nops() scans to see if the alternative starts with NOPs. However, the emit pattern is: 141: \oldinstr 142: .skip (len-(142b-141b)), 0x90 That is, when 'oldinstr' is short, the tail is padded with NOPs. This case never gets optimized. Rewrite optimize_nops() to replace any trailing string of NOPs inside the alternative with larger NOPs. Also run it irrespective of patching, replacing NOPs in both the original and replaced code. A direct consequence is that 'padlen' becomes superfluous, so remove it. [ bp: - Adjust commit message - remove a stale comment about needing to pad - add a comment in optimize_nops() - exit early if the NOP verif. 
loop catches a mismatch - function should not add NOPs in that case - fix the "optimized NOPs" offsets output ] Signed-off-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx> Signed-off-by: Borislav Petkov <bp@xxxxxxx> Signed-off-by: Ingo Molnar <mingo@xxxxxxxxxx> Link: https://lkml.kernel.org/r/20210326151259.442992235@xxxxxxxxxxxxx Signed-off-by: Ben Hutchings <ben@xxxxxxxxxxxxxxx> Signed-off-by: Greg Kroah-Hartman <gregkh@xxxxxxxxxxxxxxxxxxx> --- arch/x86/include/asm/alternative.h | 17 ++------- arch/x86/kernel/alternative.c | 49 ++++++++++++++++---------- tools/objtool/arch/x86/include/arch_special.h | 2 - 3 files changed, 37 insertions(+), 31 deletions(-) --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -65,7 +65,6 @@ struct alt_instr { u16 cpuid; /* cpuid bit set for replacement */ u8 instrlen; /* length of original instruction */ u8 replacementlen; /* length of new instruction */ - u8 padlen; /* length of build-time padding */ } __packed; /* @@ -104,7 +103,6 @@ static inline int alternatives_text_rese #define alt_end_marker "663" #define alt_slen "662b-661b" -#define alt_pad_len alt_end_marker"b-662b" #define alt_total_slen alt_end_marker"b-661b" #define alt_rlen(num) e_replacement(num)"f-"b_replacement(num)"f" @@ -151,8 +149,7 @@ static inline int alternatives_text_rese " .long " b_replacement(num)"f - .\n" /* new instruction */ \ " .word " __stringify(feature) "\n" /* feature bit */ \ " .byte " alt_total_slen "\n" /* source len */ \ - " .byte " alt_rlen(num) "\n" /* replacement len */ \ - " .byte " alt_pad_len "\n" /* pad len */ + " .byte " alt_rlen(num) "\n" /* replacement len */ #define ALTINSTR_REPLACEMENT(newinstr, feature, num) /* replacement */ \ "# ALT: replacement " #num "\n" \ @@ -224,9 +221,6 @@ static inline int alternatives_text_rese * Peculiarities: * No memory clobber here. * Argument numbers start with 1. - * Best is to use constraints that are fixed size (like (%1) ... 
"r") - * If you use variable sized constraints like "m" or "g" in the - * replacement make sure to pad to the worst case length. * Leaving an unused argument 0 to keep API compatibility. */ #define alternative_input(oldinstr, newinstr, feature, input...) \ @@ -315,13 +309,12 @@ static inline int alternatives_text_rese * enough information for the alternatives patching code to patch an * instruction. See apply_alternatives(). */ -.macro altinstruction_entry orig alt feature orig_len alt_len pad_len +.macro altinstruction_entry orig alt feature orig_len alt_len .long \orig - . .long \alt - . .word \feature .byte \orig_len .byte \alt_len - .byte \pad_len .endm /* @@ -338,7 +331,7 @@ static inline int alternatives_text_rese 142: .pushsection .altinstructions,"a" - altinstruction_entry 140b,143f,\feature,142b-140b,144f-143f,142b-141b + altinstruction_entry 140b,143f,\feature,142b-140b,144f-143f .popsection .pushsection .altinstr_replacement,"ax" @@ -375,8 +368,8 @@ static inline int alternatives_text_rese 142: .pushsection .altinstructions,"a" - altinstruction_entry 140b,143f,\feature1,142b-140b,144f-143f,142b-141b - altinstruction_entry 140b,144f,\feature2,142b-140b,145f-144f,142b-141b + altinstruction_entry 140b,143f,\feature1,142b-140b,144f-143f + altinstruction_entry 140b,144f,\feature2,142b-140b,145f-144f .popsection .pushsection .altinstr_replacement,"ax" --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -344,19 +344,35 @@ done: static void __init_or_module noinline optimize_nops(struct alt_instr *a, u8 *instr) { unsigned long flags; - int i; + struct insn insn; + int nop, i = 0; - for (i = 0; i < a->padlen; i++) { - if (instr[i] != 0x90) + /* + * Jump over the non-NOP insns, the remaining bytes must be single-byte + * NOPs, optimize them. 
+ */ + for (;;) { + if (insn_decode_kernel(&insn, &instr[i])) + return; + + if (insn.length == 1 && insn.opcode.bytes[0] == 0x90) + break; + + if ((i += insn.length) >= a->instrlen) + return; + } + + for (nop = i; i < a->instrlen; i++) { + if (WARN_ONCE(instr[i] != 0x90, "Not a NOP at 0x%px\n", &instr[i])) return; } local_irq_save(flags); - add_nops(instr + (a->instrlen - a->padlen), a->padlen); + add_nops(instr + nop, i - nop); local_irq_restore(flags); DUMP_BYTES(instr, a->instrlen, "%px: [%d:%d) optimized NOPs: ", - instr, a->instrlen - a->padlen, a->padlen); + instr, nop, a->instrlen); } /* @@ -402,19 +418,15 @@ void __init_or_module noinline apply_alt * - feature not present but ALTINSTR_FLAG_INV is set to mean, * patch if feature is *NOT* present. */ - if (!boot_cpu_has(feature) == !(a->cpuid & ALTINSTR_FLAG_INV)) { - if (a->padlen > 1) - optimize_nops(a, instr); - - continue; - } + if (!boot_cpu_has(feature) == !(a->cpuid & ALTINSTR_FLAG_INV)) + goto next; - DPRINTK("feat: %s%d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d), pad: %d", + DPRINTK("feat: %s%d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d)", (a->cpuid & ALTINSTR_FLAG_INV) ? "!" 
: "", feature >> 5, feature & 0x1f, instr, instr, a->instrlen, - replacement, a->replacementlen, a->padlen); + replacement, a->replacementlen); DUMP_BYTES(instr, a->instrlen, "%px: old_insn: ", instr); DUMP_BYTES(replacement, a->replacementlen, "%px: rpl_insn: ", replacement); @@ -438,14 +450,15 @@ void __init_or_module noinline apply_alt if (a->replacementlen && is_jmp(replacement[0])) recompute_jump(a, instr, replacement, insn_buff); - if (a->instrlen > a->replacementlen) { - add_nops(insn_buff + a->replacementlen, - a->instrlen - a->replacementlen); - insn_buff_sz += a->instrlen - a->replacementlen; - } + for (; insn_buff_sz < a->instrlen; insn_buff_sz++) + insn_buff[insn_buff_sz] = 0x90; + DUMP_BYTES(insn_buff, insn_buff_sz, "%px: final_insn: ", instr); text_poke_early(instr, insn_buff, insn_buff_sz); + +next: + optimize_nops(a, instr); } } --- a/tools/objtool/arch/x86/include/arch_special.h +++ b/tools/objtool/arch/x86/include/arch_special.h @@ -10,7 +10,7 @@ #define JUMP_ORIG_OFFSET 0 #define JUMP_NEW_OFFSET 4 -#define ALT_ENTRY_SIZE 13 +#define ALT_ENTRY_SIZE 12 #define ALT_ORIG_OFFSET 0 #define ALT_NEW_OFFSET 4 #define ALT_FEATURE_OFFSET 8 Patches currently in stable-queue which might be from peterz@xxxxxxxxxxxxx are queue-5.10/objtool-cache-instruction-relocs.patch queue-5.10/x86-sev-avoid-using-__x86_return_thunk.patch queue-5.10/objtool-add-elf_create_undef_symbol.patch queue-5.10/x86-ftrace-use-alternative-ret-encoding.patch queue-5.10/objtool-re-add-unwind_hint_-save_restore.patch queue-5.10/x86-bugs-add-retbleed-ibpb.patch queue-5.10/x86-bugs-enable-stibp-for-jmp2ret.patch queue-5.10/x86-retpoline-cleanup-some-ifdefery.patch queue-5.10/objtool-handle-__sanitize_cov-tail-calls.patch queue-5.10/x86-prepare-asm-files-for-straight-line-speculation.patch queue-5.10/kvm-vmx-flatten-__vmx_vcpu_run.patch queue-5.10/x86-kvm-vmx-make-noinstr-clean.patch queue-5.10/objtool-x86-replace-alternatives-with-.retpoline_sites.patch 
queue-5.10/objtool-skip-magical-retpoline-.altinstr_replacement.patch queue-5.10/x86-retbleed-add-fine-grained-kconfig-knobs.patch queue-5.10/x86-cpu-amd-add-spectral-chicken.patch queue-5.10/objtool-add-straight-line-speculation-validation.patch queue-5.10/kvm-vmx-fix-ibrs-handling-after-vmexit.patch queue-5.10/kvm-vmx-prevent-guest-rsb-poisoning-attacks-with-eibrs.patch queue-5.10/x86-vsyscall_emu-64-don-t-use-ret-in-vsyscall-emulation.patch queue-5.10/tools-arch-update-arch-x86-lib-mem-cpy-set-_64.s-copies-used-in-perf-bench-mem-memcpy.patch queue-5.10/x86-add-straight-line-speculation-mitigation.patch queue-5.10/x86-add-magic-amd-return-thunk.patch queue-5.10/x86-bugs-keep-a-per-cpu-ia32_spec_ctrl-value.patch queue-5.10/x86-alternatives-optimize-optimize_nops.patch queue-5.10/x86-objtool-create-.return_sites.patch queue-5.10/crypto-x86-poly1305-fixup-sls.patch queue-5.10/x86-alternative-handle-jcc-__x86_indirect_thunk_-reg.patch queue-5.10/x86-kvm-fix-setcc-emulation-for-return-thunks.patch queue-5.10/objtool-fix-objtool-regression-on-x32-systems.patch queue-5.10/x86-alternative-relax-text_poke_bp-constraint.patch queue-5.10/x86-retpoline-swizzle-retpoline-thunk.patch queue-5.10/objtool-rework-the-elf_rebuild_reloc_section-logic.patch queue-5.10/x86-speculation-fix-firmware-entry-spec_ctrl-handling.patch queue-5.10/x86-retpoline-remove-unused-replacement-symbols.patch queue-5.10/objtool-fix-symbol-creation.patch queue-5.10/x86-speculation-add-spectre_v2-ibrs-option-to-support-kernel-ibrs.patch queue-5.10/bpf-x86-respect-x86_feature_retpoline.patch queue-5.10/objtool-fix-type-of-reloc-addend.patch queue-5.10/objtool-x86-rewrite-retpoline-thunk-calls.patch queue-5.10/x86-undo-return-thunk-damage.patch queue-5.10/x86-prepare-inline-asm-for-straight-line-speculation.patch queue-5.10/x86-alternative-support-alternative_ternary.patch queue-5.10/kvm-emulate-fix-setcc-emulation-function-offsets-with-sls.patch queue-5.10/objtool-handle-per-arch-retpoline-naming.patch 
queue-5.10/x86-retpoline-create-a-retpoline-thunk-array.patch queue-5.10/x86-retpoline-simplify-retpolines.patch queue-5.10/x86-asm-fix-register-order.patch queue-5.10/x86-speculation-fill-rsb-on-vmexit-for-ibrs.patch queue-5.10/objtool-add-entry-unret-validation.patch queue-5.10/objtool-keep-track-of-retpoline-call-sites.patch queue-5.10/kvm-vmx-convert-launched-argument-to-flags.patch queue-5.10/objtool-add-elf_create_reloc-helper.patch queue-5.10/objtool-make-.altinstructions-section-entry-size-consistent.patch queue-5.10/x86-bpf-use-alternative-ret-encoding.patch queue-5.10/x86-common-stamp-out-the-stepping-madness.patch queue-5.10/x86-bugs-split-spectre_v2_select_mitigation-and-spectre_v2_user_select_mitigation.patch queue-5.10/x86-bugs-report-intel-retbleed-vulnerability.patch queue-5.10/bpf-x86-simplify-computing-label-offsets.patch queue-5.10/x86-cpufeatures-move-retpoline-flags-to-word-11.patch queue-5.10/x86-speculation-fix-spec_ctrl-write-on-smt-state-change.patch queue-5.10/x86-retpoline-use-mfunction-return.patch queue-5.10/x86-xen-rename-sys-entry-points.patch queue-5.10/objtool-only-rewrite-unconditional-retpoline-thunk-calls.patch queue-5.10/x86-bugs-optimize-spec_ctrl-msr-writes.patch queue-5.10/x86-alternative-optimize-single-byte-nops-at-an-arbitrary-position.patch queue-5.10/objtool-fix-code-relocs-vs-weak-symbols.patch queue-5.10/x86-bugs-report-amd-retbleed-vulnerability.patch queue-5.10/x86-static_call-use-alternative-ret-encoding.patch queue-5.10/x86-speculation-fix-rsb-filling-with-config_retpoline-n.patch queue-5.10/x86-asm-fixup-odd-gen-for-each-reg.h-usage.patch queue-5.10/x86-alternative-add-debug-prints-to-apply_retpolines.patch queue-5.10/objtool-extract-elf_symbol_add.patch queue-5.10/x86-use-return-thunk-in-asm-code.patch queue-5.10/objtool-remove-reloc-symbol-type-checks-in-get_alt_entry.patch queue-5.10/objtool-classify-symbols.patch queue-5.10/intel_idle-disable-ibrs-during-long-idle.patch 
queue-5.10/objtool-correctly-handle-retpoline-thunk-calls.patch queue-5.10/objtool-fix-.symtab_shndx-handling-for-elf_create_undef_symbol.patch queue-5.10/x86-retpoline-move-the-retpoline-thunk-declarations-to-nospec-branch.h.patch queue-5.10/objtool-support-asm-jump-tables.patch queue-5.10/x86-alternative-implement-.retpoline_sites-support.patch queue-5.10/objtool-x86-ignore-__x86_indirect_alt_-symbols.patch queue-5.10/objtool-fix-sls-validation-for-kcov-tail-call-replacement.patch queue-5.10/x86-alternative-try-inline-spectre_v2-retpoline-amd.patch queue-5.10/x86-entry-remove-skip_r11rcx.patch queue-5.10/objtool-explicitly-avoid-self-modifying-code-in-.altinstr_replacement.patch queue-5.10/x86-speculation-use-cached-host-spec_ctrl-value-for-guest-entry-exit.patch queue-5.10/x86-bugs-add-amd-retbleed-boot-parameter.patch queue-5.10/objtool-create-reloc-sections-implicitly.patch queue-5.10/x86-entry-add-kernel-ibrs-implementation.patch queue-5.10/objtool-treat-.text.__x86.-as-noinstr.patch queue-5.10/x86-lib-atomic64_386_32-rename-things.patch queue-5.10/objtool-introduce-cfi-hash.patch queue-5.10/objtool-default-ignore-int3-for-unreachable.patch queue-5.10/objtool-extract-elf_strtab_concat.patch queue-5.10/objtool-teach-get_alt_entry-about-more-relocation-types.patch queue-5.10/objtool-update-retpoline-validation.patch