Re: [PATCH] xfs: add a shrinker to background inode reclaim

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



This is quite ugly compared to the previous version but looks correct
enough to me.  One problem is that the first filesystem registered will
get an far over-proportional number of shrink requests, which the simple
patch to pass private data to the shrinker would get around easily.

Anyway, we need a fix, so:


Reviewed-by: Christoph Hellwig <hch@xxxxxx>

On Wed, Apr 28, 2010 at 02:34:08PM +1000, Dave Chinner wrote:
> From: Dave Chinner <dchinner@xxxxxxxxxx>
> 
> On low memory boxes or those with highmem, kernel can OOM before the
> background reclaims inodes via xfssyncd. Add a shrinker to run inode
> reclaim so that it inode reclaim is expedited when memory is low.
> 
> This is more complex than it needs to be because the VM folk don't
> want a context added to the shrinker infrastructure. Hence we need
> to add a global list of XFS mount structures so the shrinker can
> traverse them.
> 
> Signed-off-by: Dave Chinner <dchinner@xxxxxxxxxx>
> ---
>  fs/xfs/linux-2.6/xfs_super.c   |    5 ++
>  fs/xfs/linux-2.6/xfs_sync.c    |  112 +++++++++++++++++++++++++++++++++++++---
>  fs/xfs/linux-2.6/xfs_sync.h    |    7 ++-
>  fs/xfs/quota/xfs_qm_syscalls.c |    3 +-
>  fs/xfs/xfs_ag.h                |    1 +
>  fs/xfs/xfs_mount.h             |    1 +
>  6 files changed, 120 insertions(+), 9 deletions(-)
> 
> diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
> index 1abcacc..a43d09e 100644
> --- a/fs/xfs/linux-2.6/xfs_super.c
> +++ b/fs/xfs/linux-2.6/xfs_super.c
> @@ -1210,6 +1210,7 @@ xfs_fs_put_super(
>  
>  	xfs_unmountfs(mp);
>  	xfs_freesb(mp);
> +	xfs_inode_shrinker_unregister(mp);
>  	xfs_icsb_destroy_counters(mp);
>  	xfs_close_devices(mp);
>  	xfs_dmops_put(mp);
> @@ -1623,6 +1624,8 @@ xfs_fs_fill_super(
>  	if (error)
>  		goto fail_vnrele;
>  
> +	xfs_inode_shrinker_register(mp);
> +
>  	kfree(mtpt);
>  	return 0;
>  
> @@ -1868,6 +1871,7 @@ init_xfs_fs(void)
>  		goto out_cleanup_procfs;
>  
>  	vfs_initquota();
> +	xfs_inode_shrinker_init();
>  
>  	error = register_filesystem(&xfs_fs_type);
>  	if (error)
> @@ -1895,6 +1899,7 @@ exit_xfs_fs(void)
>  {
>  	vfs_exitquota();
>  	unregister_filesystem(&xfs_fs_type);
> +	xfs_inode_shrinker_destroy();
>  	xfs_sysctl_unregister();
>  	xfs_cleanup_procfs();
>  	xfs_buf_terminate();
> diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
> index 3a64179..3884e20 100644
> --- a/fs/xfs/linux-2.6/xfs_sync.c
> +++ b/fs/xfs/linux-2.6/xfs_sync.c
> @@ -95,7 +95,8 @@ xfs_inode_ag_walk(
>  					   struct xfs_perag *pag, int flags),
>  	int			flags,
>  	int			tag,
> -	int			exclusive)
> +	int			exclusive,
> +	int			*nr_to_scan)
>  {
>  	uint32_t		first_index;
>  	int			last_error = 0;
> @@ -134,7 +135,7 @@ restart:
>  		if (error == EFSCORRUPTED)
>  			break;
>  
> -	} while (1);
> +	} while ((*nr_to_scan)--);
>  
>  	if (skipped) {
>  		delay(1);
> @@ -150,12 +151,15 @@ xfs_inode_ag_iterator(
>  					   struct xfs_perag *pag, int flags),
>  	int			flags,
>  	int			tag,
> -	int			exclusive)
> +	int			exclusive,
> +	int			*nr_to_scan)
>  {
>  	int			error = 0;
>  	int			last_error = 0;
>  	xfs_agnumber_t		ag;
> +	int			nr;
>  
> +	nr = nr_to_scan ? *nr_to_scan : INT_MAX;
>  	for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) {
>  		struct xfs_perag	*pag;
>  
> @@ -165,14 +169,18 @@ xfs_inode_ag_iterator(
>  			continue;
>  		}
>  		error = xfs_inode_ag_walk(mp, pag, execute, flags, tag,
> -						exclusive);
> +						exclusive, &nr);
>  		xfs_perag_put(pag);
>  		if (error) {
>  			last_error = error;
>  			if (error == EFSCORRUPTED)
>  				break;
>  		}
> +		if (nr <= 0)
> +			break;
>  	}
> +	if (nr_to_scan)
> +		*nr_to_scan = nr;
>  	return XFS_ERROR(last_error);
>  }
>  
> @@ -291,7 +299,7 @@ xfs_sync_data(
>  	ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0);
>  
>  	error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags,
> -				      XFS_ICI_NO_TAG, 0);
> +				      XFS_ICI_NO_TAG, 0, NULL);
>  	if (error)
>  		return XFS_ERROR(error);
>  
> @@ -310,7 +318,7 @@ xfs_sync_attr(
>  	ASSERT((flags & ~SYNC_WAIT) == 0);
>  
>  	return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags,
> -				     XFS_ICI_NO_TAG, 0);
> +				     XFS_ICI_NO_TAG, 0, NULL);
>  }
>  
>  STATIC int
> @@ -636,6 +644,7 @@ __xfs_inode_set_reclaim_tag(
>  	radix_tree_tag_set(&pag->pag_ici_root,
>  			   XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
>  			   XFS_ICI_RECLAIM_TAG);
> +	pag->pag_ici_reclaimable++;
>  }
>  
>  /*
> @@ -668,6 +677,7 @@ __xfs_inode_clear_reclaim_tag(
>  {
>  	radix_tree_tag_clear(&pag->pag_ici_root,
>  			XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
> +	pag->pag_ici_reclaimable--;
>  }
>  
>  /*
> @@ -817,5 +827,93 @@ xfs_reclaim_inodes(
>  	int		mode)
>  {
>  	return xfs_inode_ag_iterator(mp, xfs_reclaim_inode, mode,
> -					XFS_ICI_RECLAIM_TAG, 1);
> +					XFS_ICI_RECLAIM_TAG, 1, NULL);
> +}
> +
> +/*
> + * Shrinker infrastructure.
> + *
> + * This is all far more complex than it needs to be. It adds a global list of
> + * mounts because the shrinkers can only call a global context. We need to make
> + * the shrinkers pass a context to avoid the need for global state.
> + */
> +static LIST_HEAD(xfs_mount_list);
> +static struct rw_semaphore xfs_mount_list_lock;
> +
> +static int
> +xfs_reclaim_inode_shrink(
> +	int		nr_to_scan,
> +	gfp_t		gfp_mask)
> +{
> +	struct xfs_mount *mp;
> +	struct xfs_perag *pag;
> +	xfs_agnumber_t	ag;
> +	int		reclaimable = 0;
> +
> +	if (nr_to_scan) {
> +		if (!(gfp_mask & __GFP_FS))
> +			return -1;
> +
> +		down_read(&xfs_mount_list_lock);
> +		list_for_each_entry(mp, &xfs_mount_list, m_mplist) {
> +			xfs_inode_ag_iterator(mp, xfs_reclaim_inode, 0,
> +					XFS_ICI_RECLAIM_TAG, 1, &nr_to_scan);
> +			if (nr_to_scan <= 0)
> +				break;
> +		}
> +		up_read(&xfs_mount_list_lock);
> +	}
> +
> +	down_read(&xfs_mount_list_lock);
> +	list_for_each_entry(mp, &xfs_mount_list, m_mplist) {
> +		for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) {
> +
> +			pag = xfs_perag_get(mp, ag);
> +			if (!pag->pag_ici_init) {
> +				xfs_perag_put(pag);
> +				continue;
> +			}
> +			reclaimable += pag->pag_ici_reclaimable;
> +			xfs_perag_put(pag);
> +		}
> +	}
> +	up_read(&xfs_mount_list_lock);
> +	return reclaimable;
> +}
> +
> +static struct shrinker xfs_inode_shrinker = {
> +	.shrink = xfs_reclaim_inode_shrink,
> +	.seeks = DEFAULT_SEEKS,
> +};
> +
> +void __init
> +xfs_inode_shrinker_init(void)
> +{
> +	init_rwsem(&xfs_mount_list_lock);
> +	register_shrinker(&xfs_inode_shrinker);
> +}
> +
> +void
> +xfs_inode_shrinker_destroy(void)
> +{
> +	ASSERT(list_empty(&xfs_mount_list));
> +	unregister_shrinker(&xfs_inode_shrinker);
> +}
> +
> +void
> +xfs_inode_shrinker_register(
> +	struct xfs_mount	*mp)
> +{
> +	down_write(&xfs_mount_list_lock);
> +	list_add_tail(&mp->m_mplist, &xfs_mount_list);
> +	up_write(&xfs_mount_list_lock);
> +}
> +
> +void
> +xfs_inode_shrinker_unregister(
> +	struct xfs_mount	*mp)
> +{
> +	down_write(&xfs_mount_list_lock);
> +	list_del(&mp->m_mplist);
> +	up_write(&xfs_mount_list_lock);
>  }
> diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
> index d480c34..cdcbaac 100644
> --- a/fs/xfs/linux-2.6/xfs_sync.h
> +++ b/fs/xfs/linux-2.6/xfs_sync.h
> @@ -53,6 +53,11 @@ void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag,
>  int xfs_sync_inode_valid(struct xfs_inode *ip, struct xfs_perag *pag);
>  int xfs_inode_ag_iterator(struct xfs_mount *mp,
>  	int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags),
> -	int flags, int tag, int write_lock);
> +	int flags, int tag, int write_lock, int *nr_to_scan);
> +
> +void xfs_inode_shrinker_init(void);
> +void xfs_inode_shrinker_destroy(void);
> +void xfs_inode_shrinker_register(struct xfs_mount *mp);
> +void xfs_inode_shrinker_unregister(struct xfs_mount *mp);
>  
>  #endif
> diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
> index f14408e..26fa431 100644
> --- a/fs/xfs/quota/xfs_qm_syscalls.c
> +++ b/fs/xfs/quota/xfs_qm_syscalls.c
> @@ -889,7 +889,8 @@ xfs_qm_dqrele_all_inodes(
>  	uint		 flags)
>  {
>  	ASSERT(mp->m_quotainfo);
> -	xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, XFS_ICI_NO_TAG, 0);
> +	xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags,
> +				XFS_ICI_NO_TAG, 0, NULL);
>  }
>  
>  /*------------------------------------------------------------------------*/
> diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
> index b1a5a1f..abb8222 100644
> --- a/fs/xfs/xfs_ag.h
> +++ b/fs/xfs/xfs_ag.h
> @@ -223,6 +223,7 @@ typedef struct xfs_perag {
>  	int		pag_ici_init;	/* incore inode cache initialised */
>  	rwlock_t	pag_ici_lock;	/* incore inode lock */
>  	struct radix_tree_root pag_ici_root;	/* incore inode cache root */
> +	int		pag_ici_reclaimable;	/* reclaimable inodes */
>  #endif
>  	int		pagb_count;	/* pagb slots in use */
>  	xfs_perag_busy_t pagb_list[XFS_PAGB_NUM_SLOTS];	/* unstable blocks */
> diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
> index 4fa0bc7..9ff48a1 100644
> --- a/fs/xfs/xfs_mount.h
> +++ b/fs/xfs/xfs_mount.h
> @@ -259,6 +259,7 @@ typedef struct xfs_mount {
>  	wait_queue_head_t	m_wait_single_sync_task;
>  	__int64_t		m_update_flags;	/* sb flags we need to update
>  						   on the next remount,rw */
> +	struct list_head	m_mplist;	/* inode shrinker mount list */
>  } xfs_mount_t;
>  
>  /*
> -- 
> 1.5.6.5
> 
> _______________________________________________
> xfs mailing list
> xfs@xxxxxxxxxxx
> http://oss.sgi.com/mailman/listinfo/xfs
---end quoted text---

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@xxxxxxxxxx  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@xxxxxxxxx";> email@xxxxxxxxx </a>

[Index of Archives]     [Linux ARM Kernel]     [Linux ARM]     [Linux Omap]     [Fedora ARM]     [IETF Annouce]     [Bugtraq]     [Linux]     [Linux OMAP]     [Linux MIPS]     [ECOS]     [Asterisk Internet PBX]     [Linux API]