Re: [RFCv3 3/3] iomap: Support subpage size dirty tracking to improve write performance

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Mon, Feb 27, 2023 at 01:13:32AM +0530, Ritesh Harjani (IBM) wrote:
> On a 64k pagesize platforms (specially Power and/or aarch64) with 4k
> filesystem blocksize, this patch should improve the performance by doing
> only the subpage dirty data write.
> 
> This should also reduce the write amplification since we can now track
> subpage dirty status within state bitmaps. Earlier we had to
> write the entire 64k page even if only a part of it (e.g. 4k) was
> updated.
> 
> Performance testing of below fio workload reveals ~16x performance
> improvement on nvme with XFS (4k blocksize) on Power (64K pagesize)
> FIO reported write bw scores improved from around ~28 MBps to ~452 MBps.
> 
> 1. <test_randwrite.fio>
> [global]
> 	ioengine=psync
> 	rw=randwrite
> 	overwrite=1
> 	pre_read=1
> 	direct=0
> 	bs=4k
> 	size=1G
> 	dir=./
> 	numjobs=8
> 	fdatasync=1
> 	runtime=60
> 	iodepth=64
> 	group_reporting=1
> 
> [fio-run]
> 
> 2. Also our internal performance team reported that this patch improves there
>    database workload performance by around ~83% (with XFS on Power)
> 
> Reported-by: Aravinda Herle <araherle@xxxxxxxxxx>
> Signed-off-by: Ritesh Harjani (IBM) <ritesh.list@xxxxxxxxx>
> ---
>  fs/gfs2/aops.c         |   2 +-
>  fs/iomap/buffered-io.c | 104 +++++++++++++++++++++++++++++++++++++----
>  fs/xfs/xfs_aops.c      |   2 +-
>  fs/zonefs/super.c      |   2 +-
>  include/linux/iomap.h  |   1 +
>  5 files changed, 99 insertions(+), 12 deletions(-)
> 
> diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
> index e782b4f1d104..b9c35288a5eb 100644
> --- a/fs/gfs2/aops.c
> +++ b/fs/gfs2/aops.c
> @@ -741,7 +741,7 @@ static const struct address_space_operations gfs2_aops = {
>  	.writepages = gfs2_writepages,
>  	.read_folio = gfs2_read_folio,
>  	.readahead = gfs2_readahead,
> -	.dirty_folio = filemap_dirty_folio,
> +	.dirty_folio = iomap_dirty_folio,
>  	.release_folio = iomap_release_folio,
>  	.invalidate_folio = iomap_invalidate_folio,
>  	.bmap = gfs2_bmap,
> diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
> index e0b0be16278e..fb55183c547f 100644
> --- a/fs/iomap/buffered-io.c
> +++ b/fs/iomap/buffered-io.c
> @@ -44,8 +44,8 @@ static inline struct iomap_page *to_iomap_page(struct folio *folio)
>  static struct bio_set iomap_ioend_bioset;
> 
>  /*
> - * Accessor functions for setting/clearing/checking uptodate bits in
> - * iop->state bitmap.
> + * Accessor functions for setting/clearing/checking uptodate and
> + * dirty bits in iop->state bitmap.
>   * nrblocks is i_blocks_per_folio() which is passed in every
>   * function as the last argument for API consistency.
>   */
> @@ -75,8 +75,29 @@ static inline bool iop_full_uptodate(struct iomap_page *iop,
>  	return bitmap_full(iop->state, nrblocks);
>  }
> 
> +static inline void iop_set_range_dirty(struct iomap_page *iop,
> +				unsigned int start, unsigned int len,
> +				unsigned int nrblocks)
> +{
> +	bitmap_set(iop->state, start + nrblocks, len);
> +}
> +
> +static inline void iop_clear_range_dirty(struct iomap_page *iop,
> +				unsigned int start, unsigned int len,
> +				unsigned int nrblocks)
> +{
> +	bitmap_clear(iop->state, start + nrblocks, len);
> +}
> +
> +static inline bool iop_test_dirty(struct iomap_page *iop, unsigned int pos,
> +			     unsigned int nrblocks)
> +{
> +	return test_bit(pos + nrblocks, iop->state);
> +}
> +
>  static struct iomap_page *
> -iomap_page_create(struct inode *inode, struct folio *folio, unsigned int flags)
> +iomap_page_create(struct inode *inode, struct folio *folio, unsigned int flags,
> +		  bool is_dirty)
>  {
>  	struct iomap_page *iop = to_iomap_page(folio);
>  	unsigned int nr_blocks = i_blocks_per_folio(inode, folio);
> @@ -90,12 +111,18 @@ iomap_page_create(struct inode *inode, struct folio *folio, unsigned int flags)
>  	else
>  		gfp = GFP_NOFS | __GFP_NOFAIL;
> 
> -	iop = kzalloc(struct_size(iop, state, BITS_TO_LONGS(nr_blocks)),
> +	/*
> +	 * iop->state tracks 2 types of bitmaps i.e. uptodate & dirty
> +	 * for bs < ps.
> +	 */

PLease write comments out in full. Also "bs < ps" is actually
wrong because we have large folios in the page cache and they will
need to use sub-folio state tracking if they are dirtied.

	/*
	 * iop->state tracks two sets of state flags when the
	 * filesystem block size is smaller than the folio size.
	 * state. The first tracks per-filesystem block uptodate
	 * state, the second tracks per-filesystem block dirty
	 * state.
	 */

> +	iop = kzalloc(struct_size(iop, state, BITS_TO_LONGS(2 * nr_blocks)),
>  		      gfp);
>  	if (iop) {
>  		spin_lock_init(&iop->state_lock);
>  		if (folio_test_uptodate(folio))
>  			iop_set_range_uptodate(iop, 0, nr_blocks, nr_blocks);
> +		if (is_dirty)
> +			iop_set_range_dirty(iop, 0, nr_blocks, nr_blocks);
>  		folio_attach_private(folio, iop);
>  	}
>  	return iop;
> @@ -202,6 +229,48 @@ static void iomap_set_range_uptodate(struct folio *folio,
>  		folio_mark_uptodate(folio);
>  }
> 
> +static void iomap_iop_set_range_dirty(struct folio *folio,
> +		struct iomap_page *iop, size_t off, size_t len)
> +{
> +	struct inode *inode = folio->mapping->host;
> +	unsigned int nr_blocks = i_blocks_per_folio(inode, folio);
> +	unsigned first = (off >> inode->i_blkbits);
> +	unsigned last = ((off + len - 1) >> inode->i_blkbits);

first_bit, last_bit if we are leaving this code unchanged.


> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&iop->state_lock, flags);
> +	iop_set_range_dirty(iop, first, last - first + 1, nr_blocks);
                                        ^^^^^^^^^^^^^^^^ nr_bits

I dislike all the magic "- 1" and "+ 1" sprinkles that end up in
this code because of the closed ranges nomenclature infecting this
code. If we use closed start/open ended ranges like we do all
through XFS, we have:

offset_to_start_bit()
{
	return off >> bits;			// round_down
}

offset_to_end_bit()
{
	return (off + (1 << bits) - 1) >> bits; // round_up
}

	unsigned start_bit = offset_to_start_bit(off, inode->i_blkbits);
	unsigned end_bit = offset_to_end_bit(off + len, inode->i_blkbits);
	unsigned nr_bits = end_bit - start_bit;

	iop_set_range_dirty(iop, start_bit, nr_bits, nr_blocks);

And we don't have to add magic "handle the off by one"
sprinkles everywhere that maths on closed ranges requires to get
correct...


> +	spin_unlock_irqrestore(&iop->state_lock, flags);
> +}
> +
> +static void iomap_set_range_dirty(struct folio *folio,
> +		struct iomap_page *iop, size_t off, size_t len)
> +{
> +	if (iop)
> +		iomap_iop_set_range_dirty(folio, iop, off, len);
> +}
> +
> +static void iomap_iop_clear_range_dirty(struct folio *folio,
> +		struct iomap_page *iop, size_t off, size_t len)
> +{
> +	struct inode *inode = folio->mapping->host;
> +	unsigned int nr_blocks = i_blocks_per_folio(inode, folio);
> +	unsigned first = (off >> inode->i_blkbits);
> +	unsigned last = ((off + len - 1) >> inode->i_blkbits);
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&iop->state_lock, flags);
> +	iop_clear_range_dirty(iop, first, last - first + 1, nr_blocks);
> +	spin_unlock_irqrestore(&iop->state_lock, flags);

Same here with the magic off by one sprinkles.

FWIW, this is exactly the same code as
iomap_iop_clear_range_uptodate(), is it not? The only difference is
the offset into the state bitmap and that is done by
iop_clear_range_dirty() vs iop_clear_range_uptodate()? Seems like a
layer of wrappers could be taken out of here simply by placing the
start offset correctly here for a common iop_clear_range() helper...

> +static void iomap_clear_range_dirty(struct folio *folio,
> +		struct iomap_page *iop, size_t off, size_t len)
> +{
> +	if (iop)
> +		iomap_iop_clear_range_dirty(folio, iop, off, len);
> +}

Or even here at this layer, and we squash out two layers of
very similar wrappers.

-Dave.
-- 
Dave Chinner
david@xxxxxxxxxxxxx



[Index of Archives]     [Linux Ext4 Filesystem]     [Union Filesystem]     [Filesystem Testing]     [Ceph Users]     [Ecryptfs]     [NTFS 3]     [AutoFS]     [Kernel Newbies]     [Share Photos]     [Security]     [Netfilter]     [Bugtraq]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux Cachefs]     [Reiser Filesystem]     [Linux RAID]     [NTFS 3]     [Samba]     [Device Mapper]     [CEPH Development]

  Powered by Linux