On Tue, 25 Jun 2024, Mike Snitzer wrote: > From: Trond Myklebust <trond.myklebust@xxxxxxxxxxxxxxx> > > For localio access, don't call filesystem read() and write() routines > directly. > > Some filesystem writeback routines can end up taking up a lot of stack > space (particularly xfs). Instead of risking running over due to the > extra overhead from the NFS stack, we should just call these routines > from a workqueue job. > > Use of dedicated workqueues improves performance over using the > system_unbound_wq. Localio is motivated by the promise of improved > performance, it makes little sense to yield it back. > > But further analysis of the latest stack depth requirements would be > useful. It'd be nice to root cause and fix the latest stack hogs, > because using workqueues at all can cause a loss in performance due to > context switches. > I would rather this patch were left out of the final submission, and only added if/when we have documented evidence that it helps. Thanks, NeilBrown > Signed-off-by: Trond Myklebust <trond.myklebust@xxxxxxxxxxxxxxx> > Signed-off-by: Mike Snitzer <snitzer@xxxxxxxxxx> > --- > fs/nfs/inode.c | 57 +++++++++++++++++--------- > fs/nfs/internal.h | 1 + > fs/nfs/localio.c | 102 +++++++++++++++++++++++++++++++++++----------- > 3 files changed, 118 insertions(+), 42 deletions(-) > > diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c > index f9923cbf6058..aac8c5302503 100644 > --- a/fs/nfs/inode.c > +++ b/fs/nfs/inode.c > @@ -2394,35 +2394,54 @@ static void nfs_destroy_inodecache(void) > kmem_cache_destroy(nfs_inode_cachep); > } > > +struct workqueue_struct *nfslocaliod_workqueue; > struct workqueue_struct *nfsiod_workqueue; > EXPORT_SYMBOL_GPL(nfsiod_workqueue); > > /* > - * start up the nfsiod workqueue > - */ > -static int nfsiod_start(void) > -{ > - struct workqueue_struct *wq; > - dprintk("RPC: creating workqueue nfsiod\n"); > - wq = alloc_workqueue("nfsiod", WQ_MEM_RECLAIM | WQ_UNBOUND, 0); > - if (wq == NULL) > - return -ENOMEM; > - nfsiod_workqueue = wq; > - return 0; > -} > - > -/* > - * Destroy the nfsiod workqueue > + * Destroy the nfsiod workqueues > */ > static void nfsiod_stop(void) > { > struct workqueue_struct *wq; > > wq = nfsiod_workqueue; > - if (wq == NULL) > - return; > - nfsiod_workqueue = NULL; > - destroy_workqueue(wq); > + if (wq != NULL) { > + nfsiod_workqueue = NULL; > + destroy_workqueue(wq); > + } > +#if IS_ENABLED(CONFIG_NFS_LOCALIO) > + wq = nfslocaliod_workqueue; > + if (wq != NULL) { > + nfslocaliod_workqueue = NULL; > + destroy_workqueue(wq); > + } > +#endif /* CONFIG_NFS_LOCALIO */ > +} > + > +/* > + * Start the nfsiod workqueues > + */ > +static int nfsiod_start(void) > +{ > + dprintk("RPC: creating workqueue nfsiod\n"); > + nfsiod_workqueue = alloc_workqueue("nfsiod", WQ_MEM_RECLAIM | WQ_UNBOUND, 0); > + if (nfsiod_workqueue == NULL) > + return -ENOMEM; > +#if IS_ENABLED(CONFIG_NFS_LOCALIO) > + /* > + * localio writes need to use a normal (non-memreclaim) workqueue. > + * When we start getting low on space, XFS goes and calls flush_work() on > + * a non-memreclaim work queue, which causes a priority inversion problem. > + */ > + dprintk("RPC: creating workqueue nfslocaliod\n"); > + nfslocaliod_workqueue = alloc_workqueue("nfslocaliod", WQ_UNBOUND, 0); > + if (unlikely(nfslocaliod_workqueue == NULL)) { > + nfsiod_stop(); > + return -ENOMEM; > + } > +#endif /* CONFIG_NFS_LOCALIO */ > + return 0; > } > > unsigned int nfs_net_id; > diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h > index d352040e3232..9251a357d097 100644 > --- a/fs/nfs/internal.h > +++ b/fs/nfs/internal.h > @@ -440,6 +440,7 @@ int nfs_check_flags(int); > > /* inode.c */ > extern struct workqueue_struct *nfsiod_workqueue; > +extern struct workqueue_struct *nfslocaliod_workqueue; > extern struct inode *nfs_alloc_inode(struct super_block *sb); > extern void nfs_free_inode(struct inode *); > extern int nfs_write_inode(struct inode *, struct writeback_control *); > diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c > index 724e81716b16..418b8d76692b 100644 > --- a/fs/nfs/localio.c > +++ b/fs/nfs/localio.c > @@ -44,6 +44,12 @@ struct nfs_local_fsync_ctx { > }; > static void nfs_local_fsync_work(struct work_struct *work); > > +struct nfs_local_io_args { > + struct nfs_local_kiocb *iocb; > + struct work_struct work; > + struct completion *done; > +}; > + > /* > * We need to translate between nfs status return values and > * the local errno values which may not be the same. > @@ -307,30 +313,54 @@ nfs_local_read_done(struct nfs_local_kiocb *iocb, long status) > status > 0 ? status : 0, hdr->res.eof); > } > > -static int > -nfs_do_local_read(struct nfs_pgio_header *hdr, struct file *filp, > - const struct rpc_call_ops *call_ops) > +static void nfs_local_call_read(struct work_struct *work) > { > - struct nfs_local_kiocb *iocb; > + struct nfs_local_io_args *args = > + container_of(work, struct nfs_local_io_args, work); > + struct nfs_local_kiocb *iocb = args->iocb; > + struct file *filp = iocb->kiocb.ki_filp; > struct iov_iter iter; > ssize_t status; > > + nfs_local_iter_init(&iter, iocb, READ); > + > + status = filp->f_op->read_iter(&iocb->kiocb, &iter); > + if (status != -EIOCBQUEUED) { > + nfs_local_read_done(iocb, status); > + nfs_local_pgio_release(iocb); > + } > + complete(args->done); > +} > + > +static int nfs_do_local_read(struct nfs_pgio_header *hdr, struct file *filp, > + const struct rpc_call_ops *call_ops) > +{ > + struct nfs_local_io_args args; > + DECLARE_COMPLETION_ONSTACK(done); > + struct nfs_local_kiocb *iocb; > + > dprintk("%s: vfs_read count=%u pos=%llu\n", > __func__, hdr->args.count, hdr->args.offset); > > iocb = nfs_local_iocb_alloc(hdr, filp, GFP_KERNEL); > if (iocb == NULL) > return -ENOMEM; > - nfs_local_iter_init(&iter, iocb, READ); > > nfs_local_pgio_init(hdr, call_ops); > hdr->res.eof = false; > > - status = filp->f_op->read_iter(&iocb->kiocb, &iter); > - if (status != -EIOCBQUEUED) { > - nfs_local_read_done(iocb, status); > - nfs_local_pgio_release(iocb); > - } > + /* > + * Don't call filesystem read() routines directly. > + * In order to avoid issues with stack overflow, > + * call the read routines from a workqueue job. > + */ > + args.iocb = iocb; > + args.done = &done; > + INIT_WORK_ONSTACK(&args.work, nfs_local_call_read); > + queue_work(nfslocaliod_workqueue, &args.work); > + wait_for_completion(&done); > + destroy_work_on_stack(&args.work); > + > return 0; > } > > @@ -420,14 +450,35 @@ nfs_local_write_done(struct nfs_local_kiocb *iocb, long status) > nfs_local_pgio_done(hdr, status); > } > > -static int > -nfs_do_local_write(struct nfs_pgio_header *hdr, struct file *filp, > - const struct rpc_call_ops *call_ops) > +static void nfs_local_call_write(struct work_struct *work) > { > - struct nfs_local_kiocb *iocb; > + struct nfs_local_io_args *args = > + container_of(work, struct nfs_local_io_args, work); > + struct nfs_local_kiocb *iocb = args->iocb; > + struct file *filp = iocb->kiocb.ki_filp; > struct iov_iter iter; > ssize_t status; > > + nfs_local_iter_init(&iter, iocb, WRITE); > + > + file_start_write(filp); > + status = filp->f_op->write_iter(&iocb->kiocb, &iter); > + file_end_write(filp); > + if (status != -EIOCBQUEUED) { > + nfs_local_write_done(iocb, status); > + nfs_get_vfs_attr(filp, iocb->hdr->res.fattr); > + nfs_local_pgio_release(iocb); > + } > + complete(args->done); > +} > + > +static int nfs_do_local_write(struct nfs_pgio_header *hdr, struct file *filp, > + const struct rpc_call_ops *call_ops) > +{ > + struct nfs_local_io_args args; > + DECLARE_COMPLETION_ONSTACK(done); > + struct nfs_local_kiocb *iocb; > + > dprintk("%s: vfs_write count=%u pos=%llu %s\n", > __func__, hdr->args.count, hdr->args.offset, > (hdr->args.stable == NFS_UNSTABLE) ? "unstable" : "stable"); > @@ -435,7 +486,6 @@ nfs_do_local_write(struct nfs_pgio_header *hdr, struct file *filp, > iocb = nfs_local_iocb_alloc(hdr, filp, GFP_NOIO); > if (iocb == NULL) > return -ENOMEM; > - nfs_local_iter_init(&iter, iocb, WRITE); > > switch (hdr->args.stable) { > default: > @@ -450,14 +500,20 @@ nfs_do_local_write(struct nfs_pgio_header *hdr, struct file *filp, > > nfs_set_local_verifier(hdr->inode, hdr->res.verf, hdr->args.stable); > > - file_start_write(filp); > - status = filp->f_op->write_iter(&iocb->kiocb, &iter); > - file_end_write(filp); > - if (status != -EIOCBQUEUED) { > - nfs_local_write_done(iocb, status); > - nfs_get_vfs_attr(filp, hdr->res.fattr); > - nfs_local_pgio_release(iocb); > - } > + /* > + * Don't call filesystem write() routines directly. > + * Some filesystem writeback routines can end up taking up a lot of > + * stack (particularly xfs). Instead of risking running over due to > + * the extra overhead from the NFS stack, call these write routines > + * from a workqueue job. > + */ > + args.iocb = iocb; > + args.done = &done; > + INIT_WORK_ONSTACK(&args.work, nfs_local_call_write); > + queue_work(nfslocaliod_workqueue, &args.work); > + wait_for_completion(&done); > + destroy_work_on_stack(&args.work); > + > return 0; > } > > -- > 2.44.0 > >