This patchset extends the prior PIE kernel patches (v6, by Thomas Garnier)
to also support position-independent modules that can be placed anywhere
in the 48/64-bit address space (for better KASLR).

The first part provides fixes for the PIE patches as well as
improvements/prerequisites for position-independent modules.
It also avoids generating the same object file in several places for
the kernel and modules.
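
As a quick illustration (not part of the patch), the AES assembly now
loads its lookup tables through _ASM_LEA() into RBASE (%r12), which
roughly expands as follows depending on the configuration:

	/* CONFIG_X86_PIE (built-in code): RIP-relative LEA */
	leaq	crypto_ft_tab(%rip), %r12

	/* CONFIG_X86_PIC module: indirect load through the GOT */
	movq	crypto_ft_tab@GOTPCREL(%rip), %r12

	/* otherwise rbase_load() is empty and the tables are addressed
	 * absolutely, e.g. movl crypto_ft_tab+1024(,reg,4), dst */
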
Signed-off-by: Ruslan Nikolaev <rnikola@xxxxxx>
Signed-off-by: Hassan Nadeem <hnadeem@xxxxxx>
---
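Note for reviewers (illustrative only, not part of the diff): for PIC
modules the new call/jump helpers in <asm/asm.h> expand roughly as

	_ASM_CALL(func)	-> call func@PLT		(PIC module, RETPOLINE)
	_ASM_CALL(func)	-> call *func@GOTPCREL(%rip)	(PIC module, no RETPOLINE)
	_ASM_CALL(func)	-> call func			(all other builds)

and similarly for _ASM_JMP().
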
arch/x86/crypto/aes-x86_64-asm_64.S | 81 ++++++++++++++++++------------
arch/x86/crypto/camellia-x86_64-asm_64.S | 5 +
arch/x86/crypto/cast5-avx-x86_64-asm_64.S | 9 +--
arch/x86/crypto/cast6-avx-x86_64-asm_64.S | 9 +--
arch/x86/include/asm/asm.h | 67 +++++++++++++++++++++++-
arch/x86/kernel/kvm.c | 8 +-
lib/zstd/Makefile | 2
lib/zstd/entropy_common_dec.c | 2
lib/zstd/fse_decompress_dec.c | 2
lib/zstd/zstd_common_dec.c | 2
10 files changed, 138 insertions(+), 49 deletions(-)
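
Also illustrative (assuming CONFIG_X86_PIC modules): the exception-table
handler entry keeps its 32-bit place-relative form for non-PIC builds but
is emitted as a PLT reference for PIC modules, i.e.

	.long (handler) - .	/* non-PIC: offset from the entry itself */
	.long handler@PLT	/* PIC module: PLT-based, already relative */
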
diff -uprN a/arch/x86/crypto/aes-x86_64-asm_64.S b/arch/x86/crypto/aes-x86_64-asm_64.S
--- a/arch/x86/crypto/aes-x86_64-asm_64.S 2019-03-16 10:50:57.093692118 -0400
+++ b/arch/x86/crypto/aes-x86_64-asm_64.S 2019-03-20 19:42:23.627815384 -0400
@@ -17,6 +17,7 @@
#include <linux/linkage.h>
#include <asm/asm-offsets.h>
+#include <asm/asm.h>
#define R1 %rax
#define R1E %eax
@@ -48,12 +49,34 @@
#define R10 %r10
#define R11 %r11
-/* Hold global for PIE support */
+/* Hold global for PIE/PIC support */
#define RBASE %r12
-#define prologue(FUNC,KEY,B128,B192,r1,r2,r5,r6,r7,r8,r9,r10,r11) \
+#if defined(CONFIG_X86_PIE) || (defined(MODULE) && defined(CONFIG_X86_PIC))
+# define rbase_save \
+ pushq RBASE;
+# define rbase_restore \
+ popq RBASE;
+# define rbase_load(tab) \
+ _ASM_LEA(tab, %rip, RBASE);
+# define round_mov(tab, tab_off, reg_i, reg_o) \
+ movl tab_off(RBASE,reg_i,4), reg_o;
+# define round_xor(tab, tab_off, reg_i, reg_o) \
+ xorl tab_off(RBASE,reg_i,4), reg_o;
+#else
+# define rbase_save
+# define rbase_restore
+# define rbase_load(tab)
+# define round_mov(tab, tab_off, reg_i, reg_o) \
+ movl tab+tab_off(,reg_i,4), reg_o;
+# define round_xor(tab, tab_off, reg_i, reg_o) \
+ xorl tab+tab_off(,reg_i,4), reg_o;
+#endif
+
+#define prologue(FUNC,KEY,B128,B192,TAB,r1,r2,r5,r6,r7,r8,r9,r10,r11) \
ENTRY(FUNC); \
- pushq RBASE; \
+ rbase_save \
+ rbase_load(TAB) \
movq r1,r2; \
leaq KEY+48(r8),r9; \
movq r10,r11; \
@@ -78,70 +101,62 @@
movl r6 ## E,4(r9); \
movl r7 ## E,8(r9); \
movl r8 ## E,12(r9); \
- popq RBASE; \
+ rbase_restore \
ret; \
ENDPROC(FUNC);
-#define round_mov(tab_off, reg_i, reg_o) \
- leaq tab_off(%rip), RBASE; \
- movl (RBASE,reg_i,4), reg_o;
-
-#define round_xor(tab_off, reg_i, reg_o) \
- leaq tab_off(%rip), RBASE; \
- xorl (RBASE,reg_i,4), reg_o;
-
#define round(TAB,OFFSET,r1,r2,r3,r4,r5,r6,r7,r8,ra,rb,rc,rd) \
movzbl r2 ## H,r5 ## E; \
movzbl r2 ## L,r6 ## E; \
- round_mov(TAB+1024, r5, r5 ## E)\
+ round_mov(TAB, 1024, r5, r5 ## E)\
movw r4 ## X,r2 ## X; \
- round_mov(TAB, r6, r6 ## E) \
+ round_mov(TAB, 0, r6, r6 ## E) \
roll $16,r2 ## E; \
shrl $16,r4 ## E; \
movzbl r4 ## L,r7 ## E; \
movzbl r4 ## H,r4 ## E; \
xorl OFFSET(r8),ra ## E; \
xorl OFFSET+4(r8),rb ## E; \
- round_xor(TAB+3072, r4, r5 ## E)\
- round_xor(TAB+2048, r7, r6 ## E)\
+ round_xor(TAB, 3072, r4, r5 ## E)\
+ round_xor(TAB, 2048, r7, r6 ## E)\
movzbl r1 ## L,r7 ## E; \
movzbl r1 ## H,r4 ## E; \
- round_mov(TAB+1024, r4, r4 ## E)\
+ round_mov(TAB, 1024, r4, r4 ## E)\
movw r3 ## X,r1 ## X; \
roll $16,r1 ## E; \
shrl $16,r3 ## E; \
- round_xor(TAB, r7, r5 ## E) \
+ round_xor(TAB, 0, r7, r5 ## E) \
movzbl r3 ## L,r7 ## E; \
movzbl r3 ## H,r3 ## E; \
- round_xor(TAB+3072, r3, r4 ## E)\
- round_xor(TAB+2048, r7, r5 ## E)\
+ round_xor(TAB, 3072, r3, r4 ## E)\
+ round_xor(TAB, 2048, r7, r5 ## E)\
movzbl r1 ## L,r7 ## E; \
movzbl r1 ## H,r3 ## E; \
shrl $16,r1 ## E; \
- round_xor(TAB+3072, r3, r6 ## E)\
- round_mov(TAB+2048, r7, r3 ## E)\
+ round_xor(TAB, 3072, r3, r6 ## E)\
+ round_mov(TAB, 2048, r7, r3 ## E)\
movzbl r1 ## L,r7 ## E; \
movzbl r1 ## H,r1 ## E; \
- round_xor(TAB+1024, r1, r6 ## E)\
- round_xor(TAB, r7, r3 ## E) \
+ round_xor(TAB, 1024, r1, r6 ## E)\
+ round_xor(TAB, 0, r7, r3 ## E) \
movzbl r2 ## H,r1 ## E; \
movzbl r2 ## L,r7 ## E; \
shrl $16,r2 ## E; \
- round_xor(TAB+3072, r1, r3 ## E)\
- round_xor(TAB+2048, r7, r4 ## E)\
+ round_xor(TAB, 3072, r1, r3 ## E)\
+ round_xor(TAB, 2048, r7, r4 ## E)\
movzbl r2 ## H,r1 ## E; \
movzbl r2 ## L,r2 ## E; \
xorl OFFSET+8(r8),rc ## E; \
xorl OFFSET+12(r8),rd ## E; \
- round_xor(TAB+1024, r1, r3 ## E)\
- round_xor(TAB, r2, r4 ## E)
+ round_xor(TAB, 1024, r1, r3 ## E)\
+ round_xor(TAB, 0, r2, r4 ## E)
#define move_regs(r1,r2,r3,r4) \
movl r3 ## E,r1 ## E; \
movl r4 ## E,r2 ## E;
-#define entry(FUNC,KEY,B128,B192) \
- prologue(FUNC,KEY,B128,B192,R2,R8,R1,R3,R4,R6,R10,R5,R11)
+#define entry(FUNC,KEY,B128,B192,TAB) \
+ prologue(FUNC,KEY,B128,B192,TAB,R2,R8,R1,R3,R4,R6,R10,R5,R11)
#define return(FUNC) epilogue(FUNC,R8,R2,R5,R6,R3,R4,R11)
@@ -161,7 +176,7 @@
/* void aes_enc_blk(stuct crypto_tfm *tfm, u8 *out, const u8 *in) */
- entry(aes_enc_blk,0,.Le128,.Le192)
+ entry(aes_enc_blk,0,.Le128,.Le192,crypto_ft_tab)
encrypt_round(crypto_ft_tab,-96)
encrypt_round(crypto_ft_tab,-80)
.Le192: encrypt_round(crypto_ft_tab,-64)
@@ -175,12 +190,13 @@
encrypt_round(crypto_ft_tab, 64)
encrypt_round(crypto_ft_tab, 80)
encrypt_round(crypto_ft_tab, 96)
+ rbase_load(crypto_fl_tab)
encrypt_final(crypto_fl_tab,112)
return(aes_enc_blk)
/* void aes_dec_blk(struct crypto_tfm *tfm, u8 *out, const u8 *in) */
- entry(aes_dec_blk,240,.Ld128,.Ld192)
+ entry(aes_dec_blk,240,.Ld128,.Ld192,crypto_it_tab)
decrypt_round(crypto_it_tab,-96)
decrypt_round(crypto_it_tab,-80)
.Ld192: decrypt_round(crypto_it_tab,-64)
@@ -194,5 +210,6 @@
decrypt_round(crypto_it_tab, 64)
decrypt_round(crypto_it_tab, 80)
decrypt_round(crypto_it_tab, 96)
+ rbase_load(crypto_il_tab)
decrypt_final(crypto_il_tab,112)
return(aes_dec_blk)
diff -uprN a/arch/x86/crypto/camellia-x86_64-asm_64.S b/arch/x86/crypto/camellia-x86_64-asm_64.S
--- a/arch/x86/crypto/camellia-x86_64-asm_64.S 2019-03-16 10:50:57.093692118 -0400
+++ b/arch/x86/crypto/camellia-x86_64-asm_64.S 2019-03-20 19:42:23.627815384 -0400
@@ -21,6 +21,7 @@
*/
#include <linux/linkage.h>
+#include <asm/asm.h>
.file "camellia-x86_64-asm_64.S"
.text
@@ -92,10 +93,10 @@
#define RXORbl %r9b
#define xor2ror16(T0, T1, tmp1, tmp2, ab, dst) \
- leaq T0(%rip), tmp1; \
+ _ASM_LEA(T0, %rip, tmp1); \
movzbl ab ## bl, tmp2 ## d; \
xorq (tmp1, tmp2, 8), dst; \
- leaq T1(%rip), tmp2; \
+ _ASM_LEA(T1, %rip, tmp2); \
movzbl ab ## bh, tmp1 ## d; \
xorq (tmp2, tmp1, 8), dst; \
rorq $16, ab;
diff -uprN a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
--- a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S 2019-03-16 10:50:57.093692118 -0400
+++ b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S 2019-03-20 19:42:23.627815384 -0400
@@ -25,6 +25,7 @@
#include <linux/linkage.h>
#include <asm/frame.h>
+#include <asm/asm.h>
.file "cast5-avx-x86_64-asm_64.S"
@@ -99,17 +100,17 @@
#define lookup_32bit(src, dst, op1, op2, op3, interleave_op, il_reg) \
movzbl src ## bh, RID1d; \
- leaq s1(%rip), RID2; \
+ _ASM_LEA(s1, %rip, RID2); \
movl (RID2, RID1, 4), dst ## d; \
movzbl src ## bl, RID2d; \
- leaq s2(%rip), RID1; \
+ _ASM_LEA(s2, %rip, RID1); \
op1 (RID1, RID2, 4), dst ## d; \
shrq $16, src; \
movzbl src ## bh, RID1d; \
- leaq s3(%rip), RID2; \
+ _ASM_LEA(s3, %rip, RID2); \
op2 (RID2, RID1, 4), dst ## d; \
movzbl src ## bl, RID2d; \
- leaq s4(%rip), RID1; \
+ _ASM_LEA(s4, %rip, RID1); \
op3 (RID1, RID2, 4), dst ## d; \
interleave_op(il_reg);
diff -uprN a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
--- a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S 2019-03-16 10:50:57.093692118 -0400
+++ b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S 2019-03-20 19:42:23.627815384 -0400
@@ -25,6 +25,7 @@
#include <linux/linkage.h>
#include <asm/frame.h>
+#include <asm/asm.h>
#include "glue_helper-asm-avx.S"
.file "cast6-avx-x86_64-asm_64.S"
@@ -99,17 +100,17 @@
#define lookup_32bit(src, dst, op1, op2, op3, interleave_op, il_reg) \
movzbl src ## bh, RID1d; \
- leaq s1(%rip), RID2; \
+ _ASM_LEA(s1, %rip, RID2); \
movl (RID2, RID1, 4), dst ## d; \
movzbl src ## bl, RID2d; \
- leaq s2(%rip), RID1; \
+ _ASM_LEA(s2, %rip, RID1); \
op1 (RID1, RID2, 4), dst ## d; \
shrq $16, src; \
movzbl src ## bh, RID1d; \
- leaq s3(%rip), RID2; \
+ _ASM_LEA(s3, %rip, RID2); \
op2 (RID2, RID1, 4), dst ## d; \
movzbl src ## bl, RID2d; \
- leaq s4(%rip), RID1; \
+ _ASM_LEA(s4, %rip, RID1); \
op3 (RID1, RID2, 4), dst ## d; \
interleave_op(il_reg);
diff -uprN a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h
--- a/arch/x86/include/asm/asm.h 2019-03-16 10:50:57.097692208 -0400
+++ b/arch/x86/include/asm/asm.h 2019-03-20 19:42:23.631815425 -0400
@@ -2,6 +2,48 @@
#ifndef _ASM_X86_ASM_H
#define _ASM_X86_ASM_H
+/*
+ * PIC modules require an indirection through the GOT for
+ * external symbols. For internal functions, _ASM_CALL/_ASM_JMP
+ * are optimized by replacing the indirect calls with direct
+ * ones followed by 1-byte NOP padding per call site;
+ * similarly, _ASM_LEA is optimized by replacing the MOV
+ * with an LEA and is used to load symbol addresses on x86-64.
+ * If RETPOLINE is enabled, PLT stubs are used instead to
+ * avoid the overhead for local calls.
+ */
+#if defined(MODULE) && defined(CONFIG_X86_PIC)
+# ifdef __ASSEMBLY__
+# define _ASM_LEA(v,r,a) movq v##@GOTPCREL(##r##), a
+# ifdef CONFIG_RETPOLINE
+# define _ASM_CALL(f) call f##@PLT
+# define _ASM_JMP(f) jmp f##@PLT
+# else
+# define _ASM_CALL(f) call *##f##@GOTPCREL(%rip)
+# define _ASM_JMP(f) jmp *##f##@GOTPCREL(%rip)
+# endif
+# else
+# define _ASM_LEA(v,r,a) "movq " #v "@GOTPCREL(" #r "), " #a
+# ifdef CONFIG_RETPOLINE
+# define _ASM_CALL(f) "call " #f "@PLT"
+# define _ASM_JMP(f) "jmp " #f "@PLT"
+# else
+# define _ASM_CALL(f) "call *" #f "@GOTPCREL(%%rip)"
+# define _ASM_JMP(f) "jmp *" #f "@GOTPCREL(%%rip)"
+# endif
+# endif
+#else
+# ifdef __ASSEMBLY__
+# define _ASM_CALL(f) call f
+# define _ASM_JMP(f) jmp f
+# define _ASM_LEA(v,r,a) leaq v##(##r##), a
+# else
+# define _ASM_CALL(f) "call " #f
+# define _ASM_JMP(f) "jmp " #f
+# define _ASM_LEA(v,r,a) "leaq " #v "(" #r "), " #a
+# endif
+#endif
+
#ifdef __ASSEMBLY__
# define __ASM_FORM(x) x
# define __ASM_FORM_RAW(x) x
@@ -118,6 +160,25 @@
# define CC_OUT(c) [_cc_ ## c] "=qm"
#endif
+/*
+ * PLT relocations in x86_64 PIC modules are already relative.
+ * However, due to inconsistent GNU binutils behavior (e.g., i386),
+ * avoid PLT relocations in all other cases (binutils bug 23997).
+ */
+#if defined(MODULE) && defined(CONFIG_X86_PIC)
+# ifdef __ASSEMBLY__
+# define _ASM_HANDLER(x) x##@PLT
+# else
+# define _ASM_HANDLER(x) x "@PLT"
+# endif
+#else
+# ifdef __ASSEMBLY__
+# define _ASM_HANDLER(x) (x) - .
+# else
+# define _ASM_HANDLER(x) "(" x ") - ."
+# endif
+#endif
+
/* Exception table entry */
#ifdef __ASSEMBLY__
# define _ASM_EXTABLE_HANDLE(from, to, handler) \
@@ -125,7 +186,7 @@
.balign 4 ; \
.long (from) - . ; \
.long (to) - . ; \
- .long (handler) - . ; \
+ .long _ASM_HANDLER(handler); \
.popsection
# define _ASM_EXTABLE(from, to) \
@@ -174,13 +235,13 @@
.endm
#else
-# define _EXPAND_EXTABLE_HANDLE(x) #x
+# define _EXPAND_EXTABLE_HANDLE(x) _ASM_HANDLER(#x)
# define _ASM_EXTABLE_HANDLE(from, to, handler) \
" .pushsection \"__ex_table\",\"a\"\n" \
" .balign 4\n" \
" .long (" #from ") - .\n" \
" .long (" #to ") - .\n" \
- " .long (" _EXPAND_EXTABLE_HANDLE(handler) ") - .\n" \
+ " .long " _EXPAND_EXTABLE_HANDLE(handler) "\n" \
" .popsection\n"
# define _ASM_EXTABLE(from, to) \
diff -uprN a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
--- a/arch/x86/kernel/kvm.c 2019-03-16 10:50:57.101692298 -0400
+++ b/arch/x86/kernel/kvm.c 2019-03-20 19:42:23.635815466 -0400
@@ -826,10 +826,12 @@ asm(
".global __raw_callee_save___kvm_vcpu_is_preempted;"
".type __raw_callee_save___kvm_vcpu_is_preempted, @function;"
"__raw_callee_save___kvm_vcpu_is_preempted:"
-"leaq __per_cpu_offset(%rip), %rax;"
+"pushq %rdi;"
+_ASM_LEA(__per_cpu_offset, %rip, %rax) ";"
"movq (%rax,%rdi,8), %rax;"
-"addq " __stringify(KVM_STEAL_TIME_preempted) "+steal_time(%rip), %rax;"
-"cmpb $0, (%rax);"
+"leaq " __stringify(KVM_STEAL_TIME_preempted) "+steal_time(%rip), %rdi;"
+"cmpb $0, (%rax,%rdi,1);"
+"popq %rdi;"
"setne %al;"
"ret;"
".popsection");
diff -uprN a/lib/zstd/entropy_common_dec.c b/lib/zstd/entropy_common_dec.c
--- a/lib/zstd/entropy_common_dec.c 1969-12-31 19:00:00.000000000 -0500
+++ b/lib/zstd/entropy_common_dec.c 2019-03-20 19:42:23.635815466 -0400
@@ -0,0 +1,2 @@
+// SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0
+#include "entropy_common.c"
diff -uprN a/lib/zstd/fse_decompress_dec.c b/lib/zstd/fse_decompress_dec.c
--- a/lib/zstd/fse_decompress_dec.c 1969-12-31 19:00:00.000000000 -0500
+++ b/lib/zstd/fse_decompress_dec.c 2019-03-20 19:42:23.635815466 -0400
@@ -0,0 +1,2 @@
+// SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0
+#include "fse_decompress.c"
diff -uprN a/lib/zstd/Makefile b/lib/zstd/Makefile
--- a/lib/zstd/Makefile 2019-03-13 17:01:32.000000000 -0400
+++ b/lib/zstd/Makefile 2019-03-20 19:42:23.635815466 -0400
@@ -6,4 +6,4 @@ ccflags-y += -O3
zstd_compress-y := fse_compress.o huf_compress.o compress.o \
entropy_common.o fse_decompress.o zstd_common.o
zstd_decompress-y := huf_decompress.o decompress.o \
- entropy_common.o fse_decompress.o zstd_common.o
+ entropy_common_dec.o fse_decompress_dec.o zstd_common_dec.o
diff -uprN a/lib/zstd/zstd_common_dec.c b/lib/zstd/zstd_common_dec.c
--- a/lib/zstd/zstd_common_dec.c 1969-12-31 19:00:00.000000000 -0500
+++ b/lib/zstd/zstd_common_dec.c 2019-03-20 19:42:23.635815466 -0400
@@ -0,0 +1,2 @@
+// SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0
+#include "zstd_common.c"