On Mon, Jan 18, 2021 at 02:13:28PM -0800, Darrick J. Wong wrote:
> From: Darrick J. Wong <djwong@xxxxxxxxxx>
> 
> Increase the default parallelism level for pwork clients so that we can
> take advantage of computers with a lot of CPUs and a lot of hardware.
> The posteof/cowblocks cleanup series will use the functionality
> presented in this patch to constrain the number of background per-ag gc
> threads to our best estimate of the amount of parallelism that the
> filesystem can sustain.

Dave and I had a conversation on IRC about this, trying to figure out
the "desired" parallelism of filesystem threads, in which he pointed out
that we can't easily figure out the real iops capacity of the storage --
devices don't report "nonrotational" status correctly, hardware raid
with caches and nvram doesn't neatly fit our "spinning floppy" vs "ssd"
dichotomy, and in the end the device concurrency is the queue depth of
the underlying hardware.  For really fast storage, even one io submitter
per cpu isn't enough to saturate the device.

So rather than try to figure out the magic optimal number and introduce
override knobs, XFS should not artificially limit its concurrency
levels.  Wherever the bottlenecks lie (data device, log device, log
grants, locks), we might as well schedule work items as they come so
that we can see all the hot spots as they arise.

I will rework this patch to remove the artificial limits, convert the
quotacheck pworker to be unbound (since we actually /want/ the workers
to spread out among the CPUs and we don't care about bounding), and add
WQ_SYSFS to all the workqueues so we can examine them in
/sys/bus/workqueue/devices.  A rough sketch of that workqueue setup
follows after the quoted patch below.

--D

> 
> Signed-off-by: Darrick J. Wong <djwong@xxxxxxxxxx>
> ---
>  fs/xfs/xfs_iwalk.c |    2 +
>  fs/xfs/xfs_pwork.c |   80 +++++++++++++++++++++++++++++++++++++++++++++++-----
>  fs/xfs/xfs_pwork.h |    3 +-
>  3 files changed, 76 insertions(+), 9 deletions(-)
> 
> 
> diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c
> index eae3aff9bc97..bb31ef870cdc 100644
> --- a/fs/xfs/xfs_iwalk.c
> +++ b/fs/xfs/xfs_iwalk.c
> @@ -624,7 +624,7 @@ xfs_iwalk_threaded(
>  	ASSERT(agno < mp->m_sb.sb_agcount);
>  	ASSERT(!(flags & ~XFS_IWALK_FLAGS_ALL));
>  
> -	nr_threads = xfs_pwork_guess_datadev_parallelism(mp);
> +	nr_threads = xfs_pwork_guess_workqueue_threads(mp);
>  	error = xfs_pwork_init(mp, &pctl, xfs_iwalk_ag_work, "xfs_iwalk",
>  			nr_threads);
>  	if (error)
> diff --git a/fs/xfs/xfs_pwork.c b/fs/xfs/xfs_pwork.c
> index b03333f1c84a..53606397ff54 100644
> --- a/fs/xfs/xfs_pwork.c
> +++ b/fs/xfs/xfs_pwork.c
> @@ -118,19 +118,85 @@ xfs_pwork_poll(
>  	touch_softlockup_watchdog();
>  }
>  
> +/* Estimate the amount of parallelism available for a storage device. */
> +static unsigned int
> +xfs_guess_buftarg_parallelism(
> +	struct xfs_buftarg	*btp)
> +{
> +	int			iomin;
> +	int			ioopt;
> +
> +	/*
> +	 * The device tells us that it is non-rotational, and we take that to
> +	 * mean there are no moving parts and that the device can handle all
> +	 * the CPUs throwing IO requests at it.
> +	 */
> +	if (blk_queue_nonrot(btp->bt_bdev->bd_disk->queue))
> +		return num_online_cpus();
> +
> +	/*
> +	 * The device has a preferred and minimum IO size that suggest a RAID
> +	 * setup, so infer the number of disks and assume that the parallelism
> +	 * is equal to the disk count.
> +	 */
> +	iomin = bdev_io_min(btp->bt_bdev);
> +	ioopt = bdev_io_opt(btp->bt_bdev);
> +	if (iomin > 0 && ioopt > iomin)
> +		return ioopt / iomin;
> +
> +	/*
> +	 * The device did not indicate that it has any capabilities beyond that
> +	 * of a rotating disk with a single drive head, so we estimate no
> +	 * parallelism at all.
> +	 */
> +	return 1;
> +}
> +
>  /*
> - * Return the amount of parallelism that the data device can handle, or 0 for
> - * no limit.
> + * Estimate the amount of parallelism that is available for metadata operations
> + * on this filesystem.
>   */
>  unsigned int
> -xfs_pwork_guess_datadev_parallelism(
> +xfs_pwork_guess_metadata_threads(
>  	struct xfs_mount	*mp)
>  {
> -	struct xfs_buftarg	*btp = mp->m_ddev_targp;
> +	unsigned int		threads;
>  
>  	/*
> -	 * For now we'll go with the most conservative setting possible,
> -	 * which is two threads for an SSD and 1 thread everywhere else.
> +	 * Estimate the amount of parallelism for metadata operations from the
> +	 * least capable of the two devices that handle metadata.  Cap that
> +	 * estimate to the number of AGs to avoid unnecessary lock contention.
>  	 */
> -	return blk_queue_nonrot(btp->bt_bdev->bd_disk->queue) ? 2 : 1;
> +	threads = xfs_guess_buftarg_parallelism(mp->m_ddev_targp);
> +	if (mp->m_logdev_targp != mp->m_ddev_targp)
> +		threads = min(xfs_guess_buftarg_parallelism(mp->m_logdev_targp),
> +				threads);
> +	threads = min(mp->m_sb.sb_agcount, threads);
> +
> +	/* If the storage told us it has fancy capabilities, we're done. */
> +	if (threads > 1)
> +		goto clamp;
> +
> +	/*
> +	 * Metadata storage did not even hint that it has any parallel
> +	 * capability.  If the filesystem was formatted with a stripe unit and
> +	 * width, we'll treat that as evidence of a RAID setup and estimate
> +	 * the number of disks.
> +	 */
> +	if (mp->m_sb.sb_unit > 0 && mp->m_sb.sb_width > mp->m_sb.sb_unit)
> +		threads = mp->m_sb.sb_width / mp->m_sb.sb_unit;
> +
> +clamp:
> +	/* Don't return an estimate larger than the CPU count. */
> +	return min(num_online_cpus(), threads);
> +}
> +
> +/* Estimate how many threads we need for a parallel work queue. */
> +unsigned int
> +xfs_pwork_guess_workqueue_threads(
> +	struct xfs_mount	*mp)
> +{
> +	/* pwork queues are not unbounded, so we have to abide WQ_MAX_ACTIVE. */
> +	return min_t(unsigned int, xfs_pwork_guess_metadata_threads(mp),
> +			WQ_MAX_ACTIVE);
> +}
> diff --git a/fs/xfs/xfs_pwork.h b/fs/xfs/xfs_pwork.h
> index 8133124cf3bb..6320bca9c554 100644
> --- a/fs/xfs/xfs_pwork.h
> +++ b/fs/xfs/xfs_pwork.h
> @@ -56,6 +56,7 @@ int xfs_pwork_init(struct xfs_mount *mp, struct xfs_pwork_ctl *pctl,
>  void xfs_pwork_queue(struct xfs_pwork_ctl *pctl, struct xfs_pwork *pwork);
>  int xfs_pwork_destroy(struct xfs_pwork_ctl *pctl);
>  void xfs_pwork_poll(struct xfs_pwork_ctl *pctl);
> -unsigned int xfs_pwork_guess_datadev_parallelism(struct xfs_mount *mp);
> +unsigned int xfs_pwork_guess_metadata_threads(struct xfs_mount *mp);
> +unsigned int xfs_pwork_guess_workqueue_threads(struct xfs_mount *mp);
>  
>  #endif /* __XFS_PWORK_H__ */
> 
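
For illustration, here's a minimal sketch of the workqueue setup I'm
describing above.  This is not the actual rework; the queue name
("xfs-gc"), the xfs_gc_wq_init()/xfs_blockgc_worker() helpers, and
everything besides the WQ_UNBOUND | WQ_SYSFS flags are placeholders I'm
inventing for the example:

/*
 * Sketch: instead of estimating a per-device thread count, allocate an
 * unbound workqueue and let the scheduler spread work items across the
 * CPUs.  WQ_SYSFS exposes the queue under /sys/bus/workqueue/devices so
 * its attributes can be inspected (and tuned) at runtime.
 */
#include <linux/workqueue.h>

static struct workqueue_struct	*xfs_gc_wq;	/* placeholder name */

static void
xfs_blockgc_worker(
	struct work_struct	*work)
{
	/* per-AG posteof/cowblocks scanning would go here */
}

static int
xfs_gc_wq_init(void)
{
	/*
	 * max_active == 0 means "use the workqueue default" rather than
	 * an artificial limit derived from the storage geometry.
	 */
	xfs_gc_wq = alloc_workqueue("xfs-gc", WQ_UNBOUND | WQ_SYSFS, 0);
	if (!xfs_gc_wq)
		return -ENOMEM;
	return 0;
}

With something like that in place, callers just queue_work() their items
on the queue and the thread-count guessing in xfs_pwork goes away
entirely.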