Re: [PATCH 1/5] xfs: separate read-only variables in struct xfs_mount

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Tue, May 12, 2020 at 08:30:27AM -0400, Brian Foster wrote:
> On Tue, May 12, 2020 at 07:28:07PM +1000, Dave Chinner wrote:
> > From: Dave Chinner <dchinner@xxxxxxxxxx>
> > 
> > Seeing massive cpu usage from xfs_agino_range() on one machine;
> > instruction level profiles look similar to another machine running
> > the same workload, only one machien is consuming 10x as much CPU as
> > the other and going much slower. The only real difference between
> > the two machines is core count per socket. Both are running
> > identical 16p/16GB virtual machine configurations
> > 
> ...
> > 
> > It's an improvement, hence indicating that separation and further
> > optimisation of read-only global filesystem data is worthwhile, but
> > it clearly isn't the underlying issue causing this specific
> > performance degradation.
> > 
> > Signed-off-by: Dave Chinner <dchinner@xxxxxxxxxx>
> > ---
> 
> Pretty neat improvement. Could you share your test script that generated
> the above? I have a 80 CPU box I'd be interested to give this a whirl
> on...
> 
> >  fs/xfs/xfs_mount.h | 50 +++++++++++++++++++++++++++-------------------
> >  1 file changed, 29 insertions(+), 21 deletions(-)
> > 
> > diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
> > index aba5a15792792..712b3e2583316 100644
> > --- a/fs/xfs/xfs_mount.h
> > +++ b/fs/xfs/xfs_mount.h
> > @@ -88,21 +88,12 @@ typedef struct xfs_mount {
> >  	struct xfs_buf		*m_sb_bp;	/* buffer for superblock */
> >  	char			*m_rtname;	/* realtime device name */
> >  	char			*m_logname;	/* external log device name */
> > -	int			m_bsize;	/* fs logical block size */
> >  	xfs_agnumber_t		m_agfrotor;	/* last ag where space found */
> >  	xfs_agnumber_t		m_agirotor;	/* last ag dir inode alloced */
> >  	spinlock_t		m_agirotor_lock;/* .. and lock protecting it */
> > -	xfs_agnumber_t		m_maxagi;	/* highest inode alloc group */
> > -	uint			m_allocsize_log;/* min write size log bytes */
> > -	uint			m_allocsize_blocks; /* min write size blocks */
> >  	struct xfs_da_geometry	*m_dir_geo;	/* directory block geometry */
> >  	struct xfs_da_geometry	*m_attr_geo;	/* attribute block geometry */
> >  	struct xlog		*m_log;		/* log specific stuff */
> > -	struct xfs_ino_geometry	m_ino_geo;	/* inode geometry */
> > -	int			m_logbufs;	/* number of log buffers */
> > -	int			m_logbsize;	/* size of each log buffer */
> > -	uint			m_rsumlevels;	/* rt summary levels */
> > -	uint			m_rsumsize;	/* size of rt summary, bytes */
> >  	/*
> >  	 * Optional cache of rt summary level per bitmap block with the
> >  	 * invariant that m_rsum_cache[bbno] <= the minimum i for which
> > @@ -117,9 +108,15 @@ typedef struct xfs_mount {
> >  	xfs_buftarg_t		*m_ddev_targp;	/* saves taking the address */
> >  	xfs_buftarg_t		*m_logdev_targp;/* ptr to log device */
> >  	xfs_buftarg_t		*m_rtdev_targp;	/* ptr to rt device */
> > +
> > +	/*
> > +	 * Read-only variables that are pre-calculated at mount time.
> > +	 */
> 
> The intent here is to align the entire section below, right? If so, the
> connection with the cache line alignment is a bit tenuous. Could we
> tweak and/or add a sentence to the comment to be more explicit? I.e.:
> 
> 	/*
> 	 * Align the following pre-calculated fields to a cache line to
> 	 * prevent cache line bouncing between frequently read and
> 	 * frequently written fields.
> 	 */

I kinda wish we laid out via comments each place we cross a 64b boundary
on a 64-bit CPU, but I guess seeing as some of these structures can
change size depending on config option and kernel version that's
probably just asking for confusion and madness.

Reviewed-by: Darrick J. Wong <darrick.wong@xxxxxxxxxx>

--D


> > +	int ____cacheline_aligned m_bsize;	/* fs logical block size */
> >  	uint8_t			m_blkbit_log;	/* blocklog + NBBY */
> >  	uint8_t			m_blkbb_log;	/* blocklog - BBSHIFT */
> >  	uint8_t			m_agno_log;	/* log #ag's */
> > +	uint8_t			m_sectbb_log;	/* sectlog - BBSHIFT */
> >  	uint			m_blockmask;	/* sb_blocksize-1 */
> >  	uint			m_blockwsize;	/* sb_blocksize in words */
> >  	uint			m_blockwmask;	/* blockwsize-1 */
> > @@ -138,20 +135,35 @@ typedef struct xfs_mount {
> >  	xfs_extlen_t		m_ag_prealloc_blocks; /* reserved ag blocks */
> >  	uint			m_alloc_set_aside; /* space we can't use */
> >  	uint			m_ag_max_usable; /* max space per AG */
> > -	struct radix_tree_root	m_perag_tree;	/* per-ag accounting info */
> > -	spinlock_t		m_perag_lock;	/* lock for m_perag_tree */
> > -	struct mutex		m_growlock;	/* growfs mutex */
> > +	int			m_dalign;	/* stripe unit */
> > +	int			m_swidth;	/* stripe width */
> > +	xfs_agnumber_t		m_maxagi;	/* highest inode alloc group */
> > +	uint			m_allocsize_log;/* min write size log bytes */
> > +	uint			m_allocsize_blocks; /* min write size blocks */
> > +	int			m_logbufs;	/* number of log buffers */
> > +	int			m_logbsize;	/* size of each log buffer */
> > +	uint			m_rsumlevels;	/* rt summary levels */
> > +	uint			m_rsumsize;	/* size of rt summary, bytes */
> >  	int			m_fixedfsid[2];	/* unchanged for life of FS */
> > -	uint64_t		m_flags;	/* global mount flags */
> > -	bool			m_finobt_nores; /* no per-AG finobt resv. */
> >  	uint			m_qflags;	/* quota status flags */
> > +	uint64_t		m_flags;	/* global mount flags */
> > +	int64_t			m_low_space[XFS_LOWSP_MAX];
> > +	struct xfs_ino_geometry	m_ino_geo;	/* inode geometry */
> >  	struct xfs_trans_resv	m_resv;		/* precomputed res values */
> > +						/* low free space thresholds */
> > +	bool			m_always_cow;
> > +	bool			m_fail_unmount;
> > +	bool			m_finobt_nores; /* no per-AG finobt resv. */
> > +	/*
> > +	 * End of pre-calculated read-only variables
> > +	 */
> 
> m_always_cow and m_fail_unmount are mutable via sysfs knobs so
> technically not read-only. I'm assuming that has no practical impact on
> the performance optimization, but it might be worth leaving them where
> they are to avoid confusion.
> 
> With those nits fixed:
> 
> Reviewed-by: Brian Foster <bfoster@xxxxxxxxxx>
> 
> > +
> > +	struct radix_tree_root	m_perag_tree;	/* per-ag accounting info */
> > +	spinlock_t		m_perag_lock;	/* lock for m_perag_tree */
> > +	struct mutex		m_growlock;	/* growfs mutex */
> >  	uint64_t		m_resblks;	/* total reserved blocks */
> >  	uint64_t		m_resblks_avail;/* available reserved blocks */
> >  	uint64_t		m_resblks_save;	/* reserved blks @ remount,ro */
> > -	int			m_dalign;	/* stripe unit */
> > -	int			m_swidth;	/* stripe width */
> > -	uint8_t			m_sectbb_log;	/* sectlog - BBSHIFT */
> >  	atomic_t		m_active_trans;	/* number trans frozen */
> >  	struct xfs_mru_cache	*m_filestream;  /* per-mount filestream data */
> >  	struct delayed_work	m_reclaim_work;	/* background inode reclaim */
> > @@ -160,8 +172,6 @@ typedef struct xfs_mount {
> >  	struct delayed_work	m_cowblocks_work; /* background cow blocks
> >  						     trimming */
> >  	bool			m_update_sb;	/* sb needs update in mount */
> > -	int64_t			m_low_space[XFS_LOWSP_MAX];
> > -						/* low free space thresholds */
> >  	struct xfs_kobj		m_kobj;
> >  	struct xfs_kobj		m_error_kobj;
> >  	struct xfs_kobj		m_error_meta_kobj;
> > @@ -191,8 +201,6 @@ typedef struct xfs_mount {
> >  	 */
> >  	uint32_t		m_generation;
> >  
> > -	bool			m_always_cow;
> > -	bool			m_fail_unmount;
> >  #ifdef DEBUG
> >  	/*
> >  	 * Frequency with which errors are injected.  Replaces xfs_etest; the
> > -- 
> > 2.26.1.301.g55bc3eb7cb9
> > 
> 



[Index of Archives]     [XFS Filesystem Development (older mail)]     [Linux Filesystem Development]     [Linux Audio Users]     [Yosemite Trails]     [Linux Kernel]     [Linux RAID]     [Linux SCSI]


  Powered by Linux