On Tue, Nov 28, 2023 at 09:05:21AM +1100, NeilBrown wrote: > > I have evidence from a customer site of 256 nfsd threads adding files to > delayed_fput_lists nearly twice as fast as they are retired by a single > work-queue thread running delayed_fput(). As you might imagine this > does not end well (20 million files in the queue at the time a snapshot > was taken for analysis). > > While this might point to a problem with the filesystem not handling the > final close efficiently, such problems should only hurt throughput, not > lead to memory exhaustion. > > For normal threads, the thread that closes the file also calls the > final fput so there is natural rate limiting preventing excessive growth > in the list of delayed fputs. For kernel threads, and particularly for > nfsd, delays in the final fput do not impose any throttling to prevent > the thread from closing more files. > > A simple way to fix this is to treat nfsd threads like normal processes > for task_work. Thus the pending files are queued for the thread, and > the same thread finishes the work. > > Currently KTHREADs are assumed never to call task_work_run(). With this > patch that is still the default but it is implemented by storing the > magic value TASK_WORKS_DISABLED in ->task_works. If a kthread, such as > nfsd, will call task_work_run() periodically, it sets ->task_works > to NULL to indicate this. > > Signed-off-by: NeilBrown <neilb@xxxxxxx> > --- > > I wonder which tree this should go through assuming everyone likes it. > VFS maybe?? Sure. > diff --git a/include/linux/sched.h b/include/linux/sched.h > index 292c31697248..c63c2bedbf71 100644 > --- a/include/linux/sched.h > +++ b/include/linux/sched.h > @@ -1117,6 +1117,7 @@ struct task_struct { > unsigned int sas_ss_flags; > > struct callback_head *task_works; > +#define TASK_WORKS_DISABLED ((void*)1) Should be simpler if you invert the logic? 
COMPLETELY UNTESTED --- fs/file_table.c | 2 +- fs/nfsd/nfssvc.c | 4 ++++ include/linux/task_work.h | 3 +++ kernel/fork.c | 3 +++ kernel/task_work.c | 12 ++++++++++++ 5 files changed, 23 insertions(+), 1 deletion(-) diff --git a/fs/file_table.c b/fs/file_table.c index de4a2915bfd4..e79351df22be 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -445,7 +445,7 @@ void fput(struct file *file) if (atomic_long_dec_and_test(&file->f_count)) { struct task_struct *task = current; - if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) { + if (likely(!in_interrupt())) { init_task_work(&file->f_rcuhead, ____fput); if (!task_work_add(task, &file->f_rcuhead, TWA_RESUME)) return; diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index d6122bb2d167..cea76bad3a95 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -13,6 +13,7 @@ #include <linux/fs_struct.h> #include <linux/swap.h> #include <linux/siphash.h> +#include <linux/task_work.h> #include <linux/sunrpc/stats.h> #include <linux/sunrpc/svcsock.h> @@ -943,6 +944,7 @@ nfsd(void *vrqstp) current->fs->umask = 0; + task_work_manage(current); /* Declare that I will call task_work_run() */ atomic_inc(&nfsdstats.th_cnt); set_freezable(); @@ -956,6 +958,8 @@ nfsd(void *vrqstp) svc_recv(rqstp); validate_process_creds(); + if (task_work_pending(current)) + task_work_run(); } atomic_dec(&nfsdstats.th_cnt); diff --git a/include/linux/task_work.h b/include/linux/task_work.h index 795ef5a68429..645fb94e47e0 100644 --- a/include/linux/task_work.h +++ b/include/linux/task_work.h @@ -20,6 +20,9 @@ enum task_work_notify_mode { TWA_SIGNAL_NO_IPI, }; +void task_work_off(struct task_struct *task); +void task_work_manage(struct task_struct *task); + static inline bool task_work_pending(struct task_struct *task) { return READ_ONCE(task->task_works); diff --git a/kernel/fork.c b/kernel/fork.c index 10917c3e1f03..348ed8fa9333 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2346,6 +2346,9 @@ __latent_entropy struct task_struct 
*copy_process( if (args->io_thread) p->flags |= PF_IO_WORKER; + if (args->kthread) + task_work_off(p); + if (args->name) strscpy_pad(p->comm, args->name, sizeof(p->comm)); diff --git a/kernel/task_work.c b/kernel/task_work.c index 95a7e1b7f1da..2ae948d0d124 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -5,6 +5,18 @@ static struct callback_head work_exited; /* all we need is ->next == NULL */ +void task_work_off(struct task_struct *task) +{ + task->task_works = &work_exited; +} +EXPORT_SYMBOL(task_work_off); + +void task_work_manage(struct task_struct *task) +{ + task->task_works = NULL; +} +EXPORT_SYMBOL(task_work_manage); + /** * task_work_add - ask the @task to execute @work->func() * @task: the task which should run the callback -- 2.42.0