copy_process currently copies task_struct.posix_cputimers_work as-is. If a timer interrupt arrives while handling clone and before dup_task_struct completes then the child task will have: 1. posix_cputimers_work.scheduled = true 2. posix_cputimers_work.work queued. copy_process clears task_struct.task_works, so (2) will have no effect and posix_cpu_timers_work will never run (not to mention it doesn't make sense for two tasks to share a common linked list). Since posix_cpu_timers_work never runs, posix_cputimers_work.scheduled is never cleared. Since scheduled is set, future timer interrupts will skip scheduling work, with the ultimate result that the task will never receive timer expirations. Together, the complete flow is: 1. Task 1 calls clone(), enters kernel. 2. Timer interrupt fires, schedules task work on Task 1. 2a. task_struct.posix_cputimers_work.scheduled = true 2b. task_struct.posix_cputimers_work.work added to task_struct.task_works. 3. dup_task_struct copies Task 1 to Task 2. 4. copy_process clears task_struct.task_works for Task 2. 5. Future timer interrupts on Task 2 see task_struct.posix_cputimers_work.scheduled = true and skip scheduling work. Fix this by explicitly clearing contents of task_struct.posix_cputimers_work in copy_process. This was never meant to be shared or inherited across tasks in the first place. Signed-off-by: Michael Pratt <mpratt@xxxxxxxxxx> Reported-by: Rhys Hiltner <rhys@xxxxxxxxx> Fixes: 1fb497dd0030 ("posix-cpu-timers: Provide mechanisms to defer timer handling to task_work") Cc: <stable@xxxxxxxxxxxxxxx> --- This issue was discovered while investigating a flaky test in the Go language standard library, https://golang.org/issue/49065. After our testing VMs upgraded from 5.4 to 5.10 kernels, several profiling tests started failing ~1% of the time with threads not receiving their expected profiling signals. 
Bisection of problem by Rhys blamed b6b178e38f40 ("Merge tag 'timers-core-2020-08-14' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip"). This merge commit introduced the broken commit 1fb497dd0030 ("posix-cpu-timers: Provide mechanisms to defer timer handling to task_work") and its child 0099808553ad ("x86: Select POSIX_CPU_TIMERS_TASK_WORK"), which enables the new codepath. The C program below also reproduces the problem. Build with `gcc repro.c -lrt -pthread -O2`. The program starts a CPU timer on the main thread, which then spawns child threads that create their own CPU timers and verify that they receive timer signals. At HEAD and 0099808553ad this program fails with ~3-15 / 20000 threads not receiving signals. Prior to 0099808553ad and with this patch, the program reports no failures. // SPDX-License-Identifier: GPL-2.0 #include <pthread.h> #include <signal.h> #include <stdint.h> #include <stdio.h> #include <sys/syscall.h> #include <time.h> #include <unistd.h> __thread uint64_t signaled; _Atomic int threads_bad; void signal_handler(int signo, siginfo_t *siginfo, void *uctx) { signaled++; } int gettid(void) { return syscall(SYS_gettid); } timer_t setup_timer(void) { struct sigevent sev = { .sigev_signo = SIGPROF, .sigev_notify = SIGEV_THREAD_ID, ._sigev_un = { ._tid = gettid(), }, }; struct itimerspec spec = { .it_interval = { .tv_nsec = 10*1000*1000, /* 10ms */ }, .it_value = { .tv_nsec = 10*1000*1000, /* 10ms */ }, }; timer_t timerid; int ret; ret = timer_create(CLOCK_THREAD_CPUTIME_ID, &sev, &timerid); if (ret != 0) { perror("timer_create"); _exit(1); } ret = timer_settime(timerid, 0, &spec, NULL); if (ret != 0) { perror("timer_settime"); _exit(1); } return timerid; } uint64_t thread_cpu_ns(void) { struct timespec ts; int ret; ret = clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts); if (ret != 0) { perror("clock_gettime"); _exit(1); } return ts.tv_nsec + 1000*1000*1000*ts.tv_sec; } void *thread(void *arg) { timer_t timerid; uint64_t start; int ret; 
timerid = setup_timer(); start = thread_cpu_ns(); while (1) { uint64_t now; /* 50ms passed? */ now = thread_cpu_ns(); if (now - start > 50*1000*1000) break; /* Busy loop */ for (volatile int i = 0; i < 100000; i++) ; } /* * 50ms passed; we should certainly have received some profiling * signals. */ if (signaled == 0) { printf("Thread %d received no profiling signals!\n", gettid()); threads_bad++; } ret = timer_delete(timerid); if (ret != 0) { perror("timer_delete"); _exit(1); } return NULL; } int main(void) { struct sigaction sa = { .sa_sigaction = &signal_handler, .sa_flags = SA_SIGINFO | SA_RESTART, }; int ret; sigset_t set; timer_t timerid; int bad; int thread_count = 0; ret = sigaction(SIGPROF, &sa, NULL); if (ret != 0) { perror("sigaction"); return 1; } sigemptyset(&set); sigaddset(&set, SIGPROF); ret = sigprocmask(SIG_UNBLOCK, &set, NULL); if (ret != 0) { perror("sigprocmask"); return 1; } timerid = setup_timer(); while (thread_count < 20000) { pthread_t threads[10]; for (int i = 0; i < 10; i++) { ret = pthread_create(&threads[i], NULL, &thread, NULL); if (ret != 0) { perror("pthread_create"); return 1; } thread_count++; } /* Busy loop */ for (volatile int i = 0; i < 100000; i++) ; for (int i = 0; i < 10; i++) { ret = pthread_join(threads[i], NULL); if (ret != 0) { perror("pthread_join"); return 1; } } if (thread_count % 100 == 0) printf("%d threads\n", thread_count); } bad = threads_bad; printf("Bad threads %d / %d = %f%%\n", threads_bad, thread_count, 100*((double)threads_bad) / ((double)thread_count)); if (threads_bad > 0) return 1; return 0; } include/linux/posix-timers.h | 2 ++ kernel/fork.c | 1 + kernel/time/posix-cpu-timers.c | 19 +++++++++++++++++-- 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index 00fef0064355..5bbcd280bfd2 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -184,8 +184,10 @@ static inline void posix_cputimers_group_init(struct 
posix_cputimers *pct, #endif #ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK +void clear_posix_cputimers_work(struct task_struct *p); void posix_cputimers_init_work(void); #else +static inline void clear_posix_cputimers_work(struct task_struct *p) { } static inline void posix_cputimers_init_work(void) { } #endif diff --git a/kernel/fork.c b/kernel/fork.c index 38681ad44c76..b1551c074b74 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2280,6 +2280,7 @@ static __latent_entropy struct task_struct *copy_process( p->pdeath_signal = 0; INIT_LIST_HEAD(&p->thread_group); p->task_works = NULL; + clear_posix_cputimers_work(p); #ifdef CONFIG_KRETPROBES p->kretprobe_instances.first = NULL; diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 643d412ac623..96b4e7810426 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -1158,14 +1158,29 @@ static void posix_cpu_timers_work(struct callback_head *work) handle_posix_cpu_timers(current); } +/* + * Clear existing posix CPU timers task work. + */ +void clear_posix_cputimers_work(struct task_struct *p) +{ + /* + * A copied work entry from the old task is not meaningful, clear it. + * N.B. init_task_work will not do this. + */ + memset(&p->posix_cputimers_work.work, 0, + sizeof(p->posix_cputimers_work.work)); + init_task_work(&p->posix_cputimers_work.work, + posix_cpu_timers_work); + p->posix_cputimers_work.scheduled = false; +} + /* * Initialize posix CPU timers task work in init task. Out of line to * keep the callback static and to avoid header recursion hell. */ void __init posix_cputimers_init_work(void) { - init_task_work(&current->posix_cputimers_work.work, - posix_cpu_timers_work); + clear_posix_cputimers_work(current); } /* -- 2.33.1.1089.g2158813163f-goog