Re: [PATCH v7 10/20] nfs/localio: use dedicated workqueues for filesystem read and write

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Tue, 25 Jun 2024, Mike Snitzer wrote:
> From: Trond Myklebust <trond.myklebust@xxxxxxxxxxxxxxx>
> 
> For localio access, don't call filesystem read() and write() routines
> directly.
> 
> Some filesystem writeback routines can end up taking up a lot of stack
> space (particularly xfs). Instead of risking running over due to the
> extra overhead from the NFS stack, we should just call these routines
> from a workqueue job.
> 
> Use of dedicated workqueues improves performance over using the
> system_unbound_wq. Localio is motivated by the promise of improved
> performance, it makes little sense to yield it back.
> 
> But further analysis of the latest stack depth requirements would be
> useful. It'd be nice to root cause and fix the latest stack hogs,
> because using workqueues at all can cause a loss in performance due to
> context switches.
> 

I would rather this patch were left out of the final submission, and
only added if/when we have documented evidence that it helps.

Thanks,
NeilBrown


> Signed-off-by: Trond Myklebust <trond.myklebust@xxxxxxxxxxxxxxx>
> Signed-off-by: Mike Snitzer <snitzer@xxxxxxxxxx>


> ---
>  fs/nfs/inode.c    |  57 +++++++++++++++++---------
>  fs/nfs/internal.h |   1 +
>  fs/nfs/localio.c  | 102 +++++++++++++++++++++++++++++++++++-----------
>  3 files changed, 118 insertions(+), 42 deletions(-)
> 
> diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
> index f9923cbf6058..aac8c5302503 100644
> --- a/fs/nfs/inode.c
> +++ b/fs/nfs/inode.c
> @@ -2394,35 +2394,54 @@ static void nfs_destroy_inodecache(void)
>  	kmem_cache_destroy(nfs_inode_cachep);
>  }
>  
> +struct workqueue_struct *nfslocaliod_workqueue;
>  struct workqueue_struct *nfsiod_workqueue;
>  EXPORT_SYMBOL_GPL(nfsiod_workqueue);
>  
>  /*
> - * start up the nfsiod workqueue
> - */
> -static int nfsiod_start(void)
> -{
> -	struct workqueue_struct *wq;
> -	dprintk("RPC:       creating workqueue nfsiod\n");
> -	wq = alloc_workqueue("nfsiod", WQ_MEM_RECLAIM | WQ_UNBOUND, 0);
> -	if (wq == NULL)
> -		return -ENOMEM;
> -	nfsiod_workqueue = wq;
> -	return 0;
> -}
> -
> -/*
> - * Destroy the nfsiod workqueue
> + * Destroy the nfsiod workqueues
>   */
>  static void nfsiod_stop(void)
>  {
>  	struct workqueue_struct *wq;
>  
>  	wq = nfsiod_workqueue;
> -	if (wq == NULL)
> -		return;
> -	nfsiod_workqueue = NULL;
> -	destroy_workqueue(wq);
> +	if (wq != NULL) {
> +		nfsiod_workqueue = NULL;
> +		destroy_workqueue(wq);
> +	}
> +#if IS_ENABLED(CONFIG_NFS_LOCALIO)
> +	wq = nfslocaliod_workqueue;
> +	if (wq != NULL) {
> +		nfslocaliod_workqueue = NULL;
> +		destroy_workqueue(wq);
> +	}
> +#endif /* CONFIG_NFS_LOCALIO */
> +}
> +
> +/*
> + * Start the nfsiod workqueues
> + */
> +static int nfsiod_start(void)
> +{
> +	dprintk("RPC:       creating workqueue nfsiod\n");
> +	nfsiod_workqueue = alloc_workqueue("nfsiod", WQ_MEM_RECLAIM | WQ_UNBOUND, 0);
> +	if (nfsiod_workqueue == NULL)
> +		return -ENOMEM;
> +#if IS_ENABLED(CONFIG_NFS_LOCALIO)
> +	/*
> +	 * localio writes need to use a normal (non-memreclaim) workqueue.
> +	 * When we start getting low on space, XFS goes and calls flush_work() on
> +	 * a non-memreclaim work queue, which causes a priority inversion problem.
> +	 */
> +	dprintk("RPC:       creating workqueue nfslocaliod\n");
> +	nfslocaliod_workqueue = alloc_workqueue("nfslocaliod", WQ_UNBOUND, 0);
> +	if (unlikely(nfslocaliod_workqueue == NULL)) {
> +		nfsiod_stop();
> +		return -ENOMEM;
> +	}
> +#endif /* CONFIG_NFS_LOCALIO */
> +	return 0;
>  }
>  
>  unsigned int nfs_net_id;
> diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
> index d352040e3232..9251a357d097 100644
> --- a/fs/nfs/internal.h
> +++ b/fs/nfs/internal.h
> @@ -440,6 +440,7 @@ int nfs_check_flags(int);
>  
>  /* inode.c */
>  extern struct workqueue_struct *nfsiod_workqueue;
> +extern struct workqueue_struct *nfslocaliod_workqueue;
>  extern struct inode *nfs_alloc_inode(struct super_block *sb);
>  extern void nfs_free_inode(struct inode *);
>  extern int nfs_write_inode(struct inode *, struct writeback_control *);
> diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c
> index 724e81716b16..418b8d76692b 100644
> --- a/fs/nfs/localio.c
> +++ b/fs/nfs/localio.c
> @@ -44,6 +44,12 @@ struct nfs_local_fsync_ctx {
>  };
>  static void nfs_local_fsync_work(struct work_struct *work);
>  
> +struct nfs_local_io_args {
> +	struct nfs_local_kiocb *iocb;
> +	struct work_struct work;
> +	struct completion *done;
> +};
> +
>  /*
>   * We need to translate between nfs status return values and
>   * the local errno values which may not be the same.
> @@ -307,30 +313,54 @@ nfs_local_read_done(struct nfs_local_kiocb *iocb, long status)
>  			status > 0 ? status : 0, hdr->res.eof);
>  }
>  
> -static int
> -nfs_do_local_read(struct nfs_pgio_header *hdr, struct file *filp,
> -		const struct rpc_call_ops *call_ops)
> +static void nfs_local_call_read(struct work_struct *work)
>  {
> -	struct nfs_local_kiocb *iocb;
> +	struct nfs_local_io_args *args =
> +		container_of(work, struct nfs_local_io_args, work);
> +	struct nfs_local_kiocb *iocb = args->iocb;
> +	struct file *filp = iocb->kiocb.ki_filp;
>  	struct iov_iter iter;
>  	ssize_t status;
>  
> +	nfs_local_iter_init(&iter, iocb, READ);
> +
> +	status = filp->f_op->read_iter(&iocb->kiocb, &iter);
> +	if (status != -EIOCBQUEUED) {
> +		nfs_local_read_done(iocb, status);
> +		nfs_local_pgio_release(iocb);
> +	}
> +	complete(args->done);
> +}
> +
> +static int nfs_do_local_read(struct nfs_pgio_header *hdr, struct file *filp,
> +			     const struct rpc_call_ops *call_ops)
> +{
> +	struct nfs_local_io_args args;
> +	DECLARE_COMPLETION_ONSTACK(done);
> +	struct nfs_local_kiocb *iocb;
> +
>  	dprintk("%s: vfs_read count=%u pos=%llu\n",
>  		__func__, hdr->args.count, hdr->args.offset);
>  
>  	iocb = nfs_local_iocb_alloc(hdr, filp, GFP_KERNEL);
>  	if (iocb == NULL)
>  		return -ENOMEM;
> -	nfs_local_iter_init(&iter, iocb, READ);
>  
>  	nfs_local_pgio_init(hdr, call_ops);
>  	hdr->res.eof = false;
>  
> -	status = filp->f_op->read_iter(&iocb->kiocb, &iter);
> -	if (status != -EIOCBQUEUED) {
> -		nfs_local_read_done(iocb, status);
> -		nfs_local_pgio_release(iocb);
> -	}
> +	/*
> +	 * Don't call filesystem read() routines directly.
> +	 * In order to avoid issues with stack overflow,
> +	 * call the read routines from a workqueue job.
> +	 */
> +	args.iocb = iocb;
> +	args.done = &done;
> +	INIT_WORK_ONSTACK(&args.work, nfs_local_call_read);
> +	queue_work(nfslocaliod_workqueue, &args.work);
> +	wait_for_completion(&done);
> +	destroy_work_on_stack(&args.work);
> +
>  	return 0;
>  }
>  
> @@ -420,14 +450,35 @@ nfs_local_write_done(struct nfs_local_kiocb *iocb, long status)
>  	nfs_local_pgio_done(hdr, status);
>  }
>  
> -static int
> -nfs_do_local_write(struct nfs_pgio_header *hdr, struct file *filp,
> -		const struct rpc_call_ops *call_ops)
> +static void nfs_local_call_write(struct work_struct *work)
>  {
> -	struct nfs_local_kiocb *iocb;
> +	struct nfs_local_io_args *args =
> +		container_of(work, struct nfs_local_io_args, work);
> +	struct nfs_local_kiocb *iocb = args->iocb;
> +	struct file *filp = iocb->kiocb.ki_filp;
>  	struct iov_iter iter;
>  	ssize_t status;
>  
> +	nfs_local_iter_init(&iter, iocb, WRITE);
> +
> +	file_start_write(filp);
> +	status = filp->f_op->write_iter(&iocb->kiocb, &iter);
> +	file_end_write(filp);
> +	if (status != -EIOCBQUEUED) {
> +		nfs_local_write_done(iocb, status);
> +		nfs_get_vfs_attr(filp, iocb->hdr->res.fattr);
> +		nfs_local_pgio_release(iocb);
> +	}
> +	complete(args->done);
> +}
> +
> +static int nfs_do_local_write(struct nfs_pgio_header *hdr, struct file *filp,
> +			      const struct rpc_call_ops *call_ops)
> +{
> +	struct nfs_local_io_args args;
> +	DECLARE_COMPLETION_ONSTACK(done);
> +	struct nfs_local_kiocb *iocb;
> +
>  	dprintk("%s: vfs_write count=%u pos=%llu %s\n",
>  		__func__, hdr->args.count, hdr->args.offset,
>  		(hdr->args.stable == NFS_UNSTABLE) ?  "unstable" : "stable");
> @@ -435,7 +486,6 @@ nfs_do_local_write(struct nfs_pgio_header *hdr, struct file *filp,
>  	iocb = nfs_local_iocb_alloc(hdr, filp, GFP_NOIO);
>  	if (iocb == NULL)
>  		return -ENOMEM;
> -	nfs_local_iter_init(&iter, iocb, WRITE);
>  
>  	switch (hdr->args.stable) {
>  	default:
> @@ -450,14 +500,20 @@ nfs_do_local_write(struct nfs_pgio_header *hdr, struct file *filp,
>  
>  	nfs_set_local_verifier(hdr->inode, hdr->res.verf, hdr->args.stable);
>  
> -	file_start_write(filp);
> -	status = filp->f_op->write_iter(&iocb->kiocb, &iter);
> -	file_end_write(filp);
> -	if (status != -EIOCBQUEUED) {
> -		nfs_local_write_done(iocb, status);
> -		nfs_get_vfs_attr(filp, hdr->res.fattr);
> -		nfs_local_pgio_release(iocb);
> -	}
> +	/*
> +	 * Don't call filesystem write() routines directly.
> +	 * Some filesystem writeback routines can end up taking up a lot of
> +	 * stack (particularly xfs). Instead of risking running over due to
> +	 * the extra overhead from the NFS stack, call these write routines
> +	 * from a workqueue job.
> +	 */
> +	args.iocb = iocb;
> +	args.done = &done;
> +	INIT_WORK_ONSTACK(&args.work, nfs_local_call_write);
> +	queue_work(nfslocaliod_workqueue, &args.work);
> +	wait_for_completion(&done);
> +	destroy_work_on_stack(&args.work);
> +
>  	return 0;
>  }
>  
> -- 
> 2.44.0
> 
> 






[Index of Archives]     [Linux Filesystem Development]     [Linux USB Development]     [Linux Media Development]     [Video for Linux]     [Linux NILFS]     [Linux Audio Users]     [Yosemite Info]     [Linux SCSI]

  Powered by Linux