On Thu, Jun 23, 2016 at 12:17 PM, Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx> wrote: > > With the goal being that I'm hoping that we can then actually get rid > of this (at least on x86-64, even if we leave it in some other > architectures) in 4.8. The context here was that we could almost get rid of thread-info entirely, at least for x86-64, by moving it into struct task_struct. It turns out that we're not *that* far off after the obvious cleanups I already committed, but I couldn't get things quite to work. I'm attaching a patch that I wrote today that doesn't boot, but "looks right". The reason I'm attaching it is because I'm hoping somebody wants to take a look and maybe see what else I missed, but mostly because I think the patch is interesting in a couple of cases where we just do incredibly ugly things. First off, some code that Andy wrote when he re-organized the entry path. Oh Gods, Andy. That pt_regs_to_thread_info() thing made me want to do unspeakable acts on a poor innocent wax figure that looked _exactly_ like you. I just got rid of pt_regs_to_thread_info() entirely, and just replaced it with current_thread_info(). I'm not at all convinced that trying to be that clever was really a good idea. Secondly, the x86-64 ret_from_fork calling convention was documented wrongly. It says %rdi contains the previous task pointer. Yes it does, but it doesn't mention that %r8 is supposed to contain the new thread_info. That was fun to find. And thirdly, the stack size games that asm/kprobes.h plays are just disgusting. I stared at that code for much too long. I may in fact be going blind as a result. The rest was fairly straightforward, although since the end result doesn't actually work, that "straightforward" may be broken too. But the basic approach _looks_ sane. Comments? Anybody want to play with this and see where I went wrong? (Note - this patch was written on top of the two thread-info removal patches I committed in da01e18a37a5 x86: avoid avoid passing around 'thread_info' in stack dumping code 6720a305df74 locking: avoid passing around 'thread_info' in mutex debugging code and depends on them, since "ti->task" no longer exists with CONFIG_THREAD_INFO_IN_TASK. "ti" and "task" will have the same value). Linus
This is a non-working attempt at moving the thread_info into the task_struct arch/x86/Kconfig | 1 + arch/x86/entry/common.c | 21 +++++++-------------- arch/x86/entry/entry_64.S | 9 ++++++--- arch/x86/include/asm/kprobes.h | 12 ++++++------ arch/x86/include/asm/switch_to.h | 6 ++---- arch/x86/include/asm/thread_info.h | 38 ++++---------------------------------- arch/x86/kernel/dumpstack.c | 2 +- arch/x86/kernel/irq_32.c | 2 -- arch/x86/kernel/irq_64.c | 3 +-- arch/x86/kernel/process.c | 6 ++---- arch/x86/um/ptrace_32.c | 8 ++++---- include/linux/init_task.h | 9 +++++++++ include/linux/sched.h | 14 +++++++++++++- init/Kconfig | 3 +++ init/init_task.c | 7 +++++-- 15 files changed, 64 insertions(+), 77 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index d9a94da0c29f..f33bc80577c5 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -154,6 +154,7 @@ config X86 select SPARSE_IRQ select SRCU select SYSCTL_EXCEPTION_TRACE + select THREAD_INFO_IN_TASK select USER_STACKTRACE_SUPPORT select VIRT_TO_BUS select X86_DEV_DMA_OPS if X86_64 diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index ec138e538c44..d5feac5f252d 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -31,13 +31,6 @@ #define CREATE_TRACE_POINTS #include <trace/events/syscalls.h> -static struct thread_info *pt_regs_to_thread_info(struct pt_regs *regs) -{ - unsigned long top_of_stack = - (unsigned long)(regs + 1) + TOP_OF_KERNEL_STACK_PADDING; - return (struct thread_info *)(top_of_stack - THREAD_SIZE); -} - #ifdef CONFIG_CONTEXT_TRACKING /* Called on entry from user mode with IRQs off. */ __visible void enter_from_user_mode(void) @@ -78,7 +71,7 @@ static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch) */ unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch) { - struct thread_info *ti = pt_regs_to_thread_info(regs); + struct thread_info *ti = current_thread_info(); unsigned long ret = 0; u32 work; @@ -156,7 +149,7 @@ unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch) long syscall_trace_enter_phase2(struct pt_regs *regs, u32 arch, unsigned long phase1_result) { - struct thread_info *ti = pt_regs_to_thread_info(regs); + struct thread_info *ti = current_thread_info(); long ret = 0; u32 work = ACCESS_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY; @@ -239,7 +232,7 @@ static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags) /* Disable IRQs and retry */ local_irq_disable(); - cached_flags = READ_ONCE(pt_regs_to_thread_info(regs)->flags); + cached_flags = READ_ONCE(current_thread_info()->flags); if (!(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS)) break; @@ -250,7 +243,7 @@ static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags) /* Called with IRQs disabled. */ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs) { - struct thread_info *ti = pt_regs_to_thread_info(regs); + struct thread_info *ti = current_thread_info(); u32 cached_flags; if (IS_ENABLED(CONFIG_PROVE_LOCKING) && WARN_ON(!irqs_disabled())) @@ -309,7 +302,7 @@ static void syscall_slow_exit_work(struct pt_regs *regs, u32 cached_flags) */ __visible inline void syscall_return_slowpath(struct pt_regs *regs) { - struct thread_info *ti = pt_regs_to_thread_info(regs); + struct thread_info *ti = current_thread_info(); u32 cached_flags = READ_ONCE(ti->flags); CT_WARN_ON(ct_state() != CONTEXT_KERNEL); @@ -332,7 +325,7 @@ __visible inline void syscall_return_slowpath(struct pt_regs *regs) #ifdef CONFIG_X86_64 __visible void do_syscall_64(struct pt_regs *regs) { - struct thread_info *ti = pt_regs_to_thread_info(regs); + struct thread_info *ti = current_thread_info(); unsigned long nr = regs->orig_ax; enter_from_user_mode(); @@ -365,7 +358,7 @@ __visible void do_syscall_64(struct pt_regs *regs) */ static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs) { - struct thread_info *ti = pt_regs_to_thread_info(regs); + struct thread_info *ti = current_thread_info(); unsigned int nr = (unsigned int)regs->orig_ax; #ifdef CONFIG_IA32_EMULATION diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 9ee0da1807ed..f49742de2c65 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -179,7 +179,8 @@ GLOBAL(entry_SYSCALL_64_after_swapgs) * If we need to do entry work or if we guess we'll need to do * exit work, go straight to the slow path. */ - testl $_TIF_WORK_SYSCALL_ENTRY|_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + GET_THREAD_INFO(%r11) + testl $_TIF_WORK_SYSCALL_ENTRY|_TIF_ALLWORK_MASK, TI_flags(%r11) jnz entry_SYSCALL64_slow_path entry_SYSCALL_64_fastpath: @@ -217,7 +218,8 @@ entry_SYSCALL_64_fastpath: */ DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF - testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + GET_THREAD_INFO(%r11) + testl $_TIF_ALLWORK_MASK, TI_flags(%r11) jnz 1f LOCKDEP_SYS_EXIT @@ -368,9 +370,10 @@ END(ptregs_\func) * A newly forked process directly context switches into this address. * * rdi: prev task we switched from + * rsi: task we're switching to */ ENTRY(ret_from_fork) - LOCK ; btr $TIF_FORK, TI_flags(%r8) + LOCK ; btr $TIF_FORK, TI_flags(%rsi) /* rsi: this newly forked task */ call schedule_tail /* rdi: 'prev' task parameter */ diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h index 4421b5da409d..1d2997e74b08 100644 --- a/arch/x86/include/asm/kprobes.h +++ b/arch/x86/include/asm/kprobes.h @@ -38,12 +38,12 @@ typedef u8 kprobe_opcode_t; #define RELATIVECALL_OPCODE 0xe8 #define RELATIVE_ADDR_SIZE 4 #define MAX_STACK_SIZE 64 -#define MIN_STACK_SIZE(ADDR) \ - (((MAX_STACK_SIZE) < (((unsigned long)current_thread_info()) + \ - THREAD_SIZE - (unsigned long)(ADDR))) \ - ? (MAX_STACK_SIZE) \ - : (((unsigned long)current_thread_info()) + \ - THREAD_SIZE - (unsigned long)(ADDR))) + +#define current_stack_top() ((unsigned long)task_stack_page(current)+THREAD_SIZE) +#define current_stack_size(ADDR) (current_stack_top() - (unsigned long)(ADDR)) + +#define MIN_STACK_SIZE(ADDR) \ + (MAX_STACK_SIZE < current_stack_size(ADDR) ? MAX_STACK_SIZE : current_stack_size(ADDR)) #define flush_insn_slot(p) do { } while (0) diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index 8f321a1b03a1..ae0aa0612c67 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -110,18 +110,16 @@ do { \ "call __switch_to\n\t" \ "movq "__percpu_arg([current_task])",%%rsi\n\t" \ __switch_canary \ - "movq %P[thread_info](%%rsi),%%r8\n\t" \ "movq %%rax,%%rdi\n\t" \ - "testl %[_tif_fork],%P[ti_flags](%%r8)\n\t" \ + "testl %[_tif_fork],%P[ti_flags](%%rsi)\n\t" \ "jnz ret_from_fork\n\t" \ RESTORE_CONTEXT \ : "=a" (last) \ __switch_canary_oparam \ : [next] "S" (next), [prev] "D" (prev), \ [threadrsp] "i" (offsetof(struct task_struct, thread.sp)), \ - [ti_flags] "i" (offsetof(struct thread_info, flags)), \ + [ti_flags] "i" (offsetof(struct task_struct, thread_info.flags)), \ [_tif_fork] "i" (_TIF_FORK), \ - [thread_info] "i" (offsetof(struct task_struct, stack)), \ [current_task] "m" (current_task) \ __switch_canary_iparam \ : "memory", "cc" __EXTRA_CLOBBER) diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 30c133ac05cd..eef687fdc90d 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -53,24 +53,22 @@ struct task_struct; #include <linux/atomic.h> struct thread_info { - struct task_struct *task; /* main task structure */ __u32 flags; /* low level flags */ __u32 status; /* thread synchronous flags */ __u32 cpu; /* current CPU */ - mm_segment_t addr_limit; unsigned int sig_on_uaccess_error:1; unsigned int uaccess_err:1; /* uaccess failed */ + mm_segment_t addr_limit; }; #define INIT_THREAD_INFO(tsk) \ { \ - .task = &tsk, \ .flags = 0, \ .cpu = 0, \ .addr_limit = KERNEL_DS, \ } -#define init_thread_info (init_thread_union.thread_info) +#define init_thread_info (init_task.thread_info) #define init_stack (init_thread_union.stack) #else /* !__ASSEMBLY__ */ @@ -166,7 +164,7 @@ struct thread_info { static inline struct thread_info *current_thread_info(void) { - return (struct thread_info *)(current_top_of_stack() - THREAD_SIZE); + return (struct thread_info *)current; } static inline unsigned long current_stack_pointer(void) @@ -188,35 +186,7 @@ static inline unsigned long current_stack_pointer(void) /* Load thread_info address into "reg" */ #define GET_THREAD_INFO(reg) \ - _ASM_MOV PER_CPU_VAR(cpu_current_top_of_stack),reg ; \ - _ASM_SUB $(THREAD_SIZE),reg ; - -/* - * ASM operand which evaluates to a 'thread_info' address of - * the current task, if it is known that "reg" is exactly "off" - * bytes below the top of the stack currently. - * - * ( The kernel stack's size is known at build time, it is usually - * 2 or 4 pages, and the bottom of the kernel stack contains - * the thread_info structure. So to access the thread_info very - * quickly from assembly code we can calculate down from the - * top of the kernel stack to the bottom, using constant, - * build-time calculations only. ) - * - * For example, to fetch the current thread_info->flags value into %eax - * on x86-64 defconfig kernels, in syscall entry code where RSP is - * currently at exactly SIZEOF_PTREGS bytes away from the top of the - * stack: - * - * mov ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS), %eax - * - * will translate to: - * - * 8b 84 24 b8 c0 ff ff mov -0x3f48(%rsp), %eax - * - * which is below the current RSP by almost 16K. - */ -#define ASM_THREAD_INFO(field, reg, off) ((field)+(off)-THREAD_SIZE)(reg) + _ASM_MOV PER_CPU_VAR(current_task),reg #endif diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index d6209f3a69cb..ef8017ca5ba9 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -80,7 +80,7 @@ print_ftrace_graph_addr(unsigned long addr, void *data, static inline int valid_stack_ptr(struct task_struct *task, void *p, unsigned int size, void *end) { - void *t = task_thread_info(task); + void *t = task_stack_page(task); if (end) { if (p < end && p >= (end-THREAD_SIZE)) return 1; diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 38da8f29a9c8..c627bf8d98ad 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -130,11 +130,9 @@ void irq_ctx_init(int cpu) void do_softirq_own_stack(void) { - struct thread_info *curstk; struct irq_stack *irqstk; u32 *isp, *prev_esp; - curstk = current_stack(); irqstk = __this_cpu_read(softirq_stack); /* build the stack frame on the softirq stack */ diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 206d0b90a3ab..38f9f5678dc8 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -41,8 +41,7 @@ static inline void stack_overflow_check(struct pt_regs *regs) if (user_mode(regs)) return; - if (regs->sp >= curbase + sizeof(struct thread_info) + - sizeof(struct pt_regs) + STACK_TOP_MARGIN && + if (regs->sp >= curbase + sizeof(struct pt_regs) + STACK_TOP_MARGIN && regs->sp <= curbase + THREAD_SIZE) return; diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 96becbbb52e0..8f60f810a9e7 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -536,9 +536,7 @@ unsigned long get_wchan(struct task_struct *p) * PADDING * ----------- top = topmax - TOP_OF_KERNEL_STACK_PADDING * stack - * ----------- bottom = start + sizeof(thread_info) - * thread_info - * ----------- start + * ----------- bottom = start * * The tasks stack pointer points at the location where the * framepointer is stored. The data on the stack is: @@ -549,7 +547,7 @@ unsigned long get_wchan(struct task_struct *p) */ top = start + THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING; top -= 2 * sizeof(unsigned long); - bottom = start + sizeof(struct thread_info); + bottom = start; sp = READ_ONCE(p->thread.sp); if (sp < bottom || sp > top) diff --git a/arch/x86/um/ptrace_32.c b/arch/x86/um/ptrace_32.c index ebd4dd6ef73b..14e8f6a628c2 100644 --- a/arch/x86/um/ptrace_32.c +++ b/arch/x86/um/ptrace_32.c @@ -191,7 +191,7 @@ int peek_user(struct task_struct *child, long addr, long data) static int get_fpregs(struct user_i387_struct __user *buf, struct task_struct *child) { - int err, n, cpu = ((struct thread_info *) child->stack)->cpu; + int err, n, cpu = task_thread_info(child)->cpu; struct user_i387_struct fpregs; err = save_i387_registers(userspace_pid[cpu], @@ -208,7 +208,7 @@ static int get_fpregs(struct user_i387_struct __user *buf, struct task_struct *c static int set_fpregs(struct user_i387_struct __user *buf, struct task_struct *child) { - int n, cpu = ((struct thread_info *) child->stack)->cpu; + int n, cpu = task_thread_info(child)->cpu; struct user_i387_struct fpregs; n = copy_from_user(&fpregs, buf, sizeof(fpregs)); @@ -221,7 +221,7 @@ static int set_fpregs(struct user_i387_struct __user *buf, struct task_struct *c static int get_fpxregs(struct user_fxsr_struct __user *buf, struct task_struct *child) { - int err, n, cpu = ((struct thread_info *) child->stack)->cpu; + int err, n, cpu = task_thread_info(child)->cpu; struct user_fxsr_struct fpregs; err = save_fpx_registers(userspace_pid[cpu], (unsigned long *) &fpregs); @@ -237,7 +237,7 @@ static int get_fpxregs(struct user_fxsr_struct __user *buf, struct task_struct * static int set_fpxregs(struct user_fxsr_struct __user *buf, struct task_struct *child) { - int n, cpu = ((struct thread_info *) child->stack)->cpu; + int n, cpu = task_thread_info(child)->cpu; struct user_fxsr_struct fpregs; n = copy_from_user(&fpregs, buf, sizeof(fpregs)); diff --git a/include/linux/init_task.h b/include/linux/init_task.h index f2cb8d45513d..a00f53b64c09 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -15,6 +15,8 @@ #include <net/net_namespace.h> #include <linux/sched/rt.h> +#include <asm/thread_info.h> + #ifdef CONFIG_SMP # define INIT_PUSHABLE_TASKS(tsk) \ .pushable_tasks = PLIST_NODE_INIT(tsk.pushable_tasks, MAX_PRIO), @@ -183,12 +185,19 @@ extern struct task_group root_task_group; # define INIT_KASAN(tsk) #endif +#ifdef CONFIG_THREAD_INFO_IN_TASK +# define INIT_TASK_TI(tsk) .thread_info = INIT_THREAD_INFO(tsk), +#else +# define INIT_TASK_TI(tsk) +#endif + /* * INIT_TASK is used to set up the first task table, touch at * your own risk!. Base=0, limit=0x1fffff (=2MB) */ #define INIT_TASK(tsk) \ { \ + INIT_TASK_TI(tsk) \ .state = 0, \ .stack = &init_thread_info, \ .usage = ATOMIC_INIT(2), \ diff --git a/include/linux/sched.h b/include/linux/sched.h index 6e42ada26345..06236a36ba17 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1456,6 +1456,9 @@ struct tlbflush_unmap_batch { }; struct task_struct { +#ifdef CONFIG_THREAD_INFO_IN_TASK + struct thread_info thread_info; +#endif volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ void *stack; atomic_t usage; @@ -2539,7 +2542,9 @@ extern void set_curr_task(int cpu, struct task_struct *p); void yield(void); union thread_union { +#ifndef CONFIG_THREAD_INFO_IN_TASK struct thread_info thread_info; +#endif unsigned long stack[THREAD_SIZE/sizeof(long)]; }; @@ -2967,7 +2972,14 @@ static inline void threadgroup_change_end(struct task_struct *tsk) cgroup_threadgroup_change_end(tsk); } -#ifndef __HAVE_THREAD_FUNCTIONS +#ifdef CONFIG_THREAD_INFO_IN_TASK + +#define task_thread_info(task) (&(task)->thread_info) +#define task_stack_page(task) ((task)->stack) +#define setup_thread_stack(new,old) do { } while(0) +#define end_of_stack(task) ((unsigned long *)task_stack_page(task)) + +#elif !defined(__HAVE_THREAD_FUNCTIONS) #define task_thread_info(task) ((struct thread_info *)(task)->stack) #define task_stack_page(task) ((task)->stack) diff --git a/init/Kconfig b/init/Kconfig index f755a602d4a1..0c83af6d3753 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -26,6 +26,9 @@ config IRQ_WORK config BUILDTIME_EXTABLE_SORT bool +config THREAD_INFO_IN_TASK + bool + menu "General setup" config BROKEN diff --git a/init/init_task.c b/init/init_task.c index ba0a7f362d9e..11f83be1fa79 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -22,5 +22,8 @@ EXPORT_SYMBOL(init_task); * Initial thread structure. Alignment of this is handled by a special * linker map entry. */ -union thread_union init_thread_union __init_task_data = - { INIT_THREAD_INFO(init_task) }; +union thread_union init_thread_union __init_task_data = { +#ifndef CONFIG_THREAD_INFO_IN_TASK + INIT_THREAD_INFO(init_task) +#endif +};