Re: [RFC PATCH 1/8] fs: introduce get_shared_files() for dax&reflink

"Darrick J. Wong" <darrick.wong@xxxxxxxxxx> · Fri, 7 Aug 2020 09:15:26 -0700

On Fri, Aug 07, 2020 at 09:13:29PM +0800, Shiyang Ruan wrote:
> Under the mode of both dax and reflink on, one page may be shared by
> multiple files and offsets.  In order to track them in memory-failure or
> other cases, we introduce this function by finding out who is sharing
> this block(the page) in a filesystem.  It returns a list that contains
> all the owners, and the offset in each owner.
> 
> For XFS, rmapbt is used to find out the owners of one block.  So, it
> should be turned on when we want to use dax&reflink feature together.
> 
> Signed-off-by: Shiyang Ruan <ruansy.fnst@xxxxxxxxxxxxxx>
> ---
>  fs/xfs/xfs_super.c  | 67 +++++++++++++++++++++++++++++++++++++++++++++
>  include/linux/dax.h |  7 +++++
>  include/linux/fs.h  |  2 ++
>  3 files changed, 76 insertions(+)
> 
> diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> index 379cbff438bc..b71392219c91 100644
> --- a/fs/xfs/xfs_super.c
> +++ b/fs/xfs/xfs_super.c
> @@ -35,6 +35,9 @@
>  #include "xfs_refcount_item.h"
>  #include "xfs_bmap_item.h"
>  #include "xfs_reflink.h"
> +#include "xfs_alloc.h"
> +#include "xfs_rmap.h"
> +#include "xfs_rmap_btree.h"
>  
>  #include <linux/magic.h>
>  #include <linux/fs_context.h>
> @@ -1097,6 +1100,69 @@ xfs_fs_free_cached_objects(
>  	return xfs_reclaim_inodes_nr(XFS_M(sb), sc->nr_to_scan);
>  }
>  
> +static int _get_shared_files_fn(

Needs an xfs_ prefix...

> +	struct xfs_btree_cur	*cur,
> +	struct xfs_rmap_irec	*rec,
> +	void			*priv)
> +{
> +	struct list_head	*list = priv;
> +	struct xfs_inode	*ip;
> +	struct shared_files	*sfp;
> +
> +	/* Get files that incore, filter out others that are not in use. */
> +	xfs_iget(cur->bc_mp, cur->bc_tp, rec->rm_owner, XFS_IGET_INCORE, 0, &ip);

No error checking at all?

What if rm_owner refers to metadata?

> +	if (ip && !ip->i_vnode.i_mapping)
> +		return 0;

When is the xfs_inode released?  We don't iput it here, and there's no
way for dax_unlock_page (afaict the only consumer) to do it, so we
leak the reference.

> +
> +	sfp = kmalloc(sizeof(*sfp), GFP_KERNEL);

If there are millions of open files reflinked to this range of pmem this
is going to allocate a /lot/ of memory.

> +	sfp->mapping = ip->i_vnode.i_mapping;

sfp->mapping = VFS_I(ip)->i_mapping;

> +	sfp->index = rec->rm_offset;
> +	list_add_tail(&sfp->list, list);

Why do we leave ->cookie uninitialized?  What does it even do?

> +
> +	return 0;
> +}
> +
> +static int
> +xfs_fs_get_shared_files(
> +	struct super_block	*sb,
> +	pgoff_t			offset,

Which device does this offset refer to?  XFS supports multiple storage
devices.

Also, uh, is this really a pgoff_t?  If yes, you can't use it with
XFS_B_TO_FSB below without first converting it to a loff_t.

> +	struct list_head	*list)
> +{
> +	struct xfs_mount	*mp = XFS_M(sb);
> +	struct xfs_trans	*tp = NULL;
> +	struct xfs_btree_cur	*cur = NULL;
> +	struct xfs_rmap_irec	rmap_low = { 0 }, rmap_high = { 0 };

No need to memset(0) rmap_low later, or zero rmap_high just to memset it
later.

> +	struct xfs_buf		*agf_bp = NULL;
> +	xfs_agblock_t		bno = XFS_B_TO_FSB(mp, offset);

"FSB" refers to xfs_fsblock_t.  You just ripped the upper 32 bits off
the fsblock number.

> +	xfs_agnumber_t		agno = XFS_FSB_TO_AGNO(mp, bno);
> +	int			error = 0;
> +
> +	error = xfs_trans_alloc_empty(mp, &tp);
> +	if (error)
> +		return error;
> +
> +	error = xfs_alloc_read_agf(mp, tp, agno, 0, &agf_bp);
> +	if (error)
> +		return error;
> +
> +	cur = xfs_rmapbt_init_cursor(mp, tp, agf_bp, agno);
> +
> +	memset(&cur->bc_rec, 0, sizeof(cur->bc_rec));

Not necessary, bc_rec is zero in a freshly created cursor.

> +	/* Construct the range for one rmap search */
> +	memset(&rmap_low, 0, sizeof(rmap_low));
> +	memset(&rmap_high, 0xFF, sizeof(rmap_high));
> +	rmap_low.rm_startblock = rmap_high.rm_startblock = bno;
> +
> +	error = xfs_rmap_query_range(cur, &rmap_low, &rmap_high,
> +				     _get_shared_files_fn, list);
> +	if (error == -ECANCELED)
> +		error = 0;
> +
> +	xfs_btree_del_cursor(cur, error);
> +	xfs_trans_brelse(tp, agf_bp);
> +	return error;
> +}

Looking at this, I don't think this is the right way to approach memory
poisoning.  Rather than allocating a (potentially huge) linked list and
passing it to the memory poison code to unmap pages, kill processes, and
free the list, I think:

1) "->get_shared_files" should be more targetted.  Call it ->storage_lost
or something, so that it only has one purpose, which is to react to
asynchronous notifications that storage has been lost.

2) The inner _get_shared_files_fn should directly call back into the
memory manager to remove a poisoned page from the mapping and signal
whatever process might have it mapped.

That way, _get_shared_files_fn can look in the xfs buffer cache to see
if we have a copy in DRAM, and immediately write it back to pmem.

Hmm and now that you've gotten me rambling about hwpoison, I wonder what
happens if dram backing part of the xfs buffer cache goes bad...

--D

> +
>  static const struct super_operations xfs_super_operations = {
>  	.alloc_inode		= xfs_fs_alloc_inode,
>  	.destroy_inode		= xfs_fs_destroy_inode,
> @@ -1110,6 +1176,7 @@ static const struct super_operations xfs_super_operations = {
>  	.show_options		= xfs_fs_show_options,
>  	.nr_cached_objects	= xfs_fs_nr_cached_objects,
>  	.free_cached_objects	= xfs_fs_free_cached_objects,
> +	.get_shared_files	= xfs_fs_get_shared_files,
>  };
>  
>  static int
> diff --git a/include/linux/dax.h b/include/linux/dax.h
> index 6904d4e0b2e0..0a85e321d6b4 100644
> --- a/include/linux/dax.h
> +++ b/include/linux/dax.h
> @@ -40,6 +40,13 @@ struct dax_operations {
>  
>  extern struct attribute_group dax_attribute_group;
>  
> +struct shared_files {
> +	struct list_head	list;
> +	struct address_space	*mapping;
> +	pgoff_t			index;
> +	dax_entry_t		cookie;
> +};
> +
>  #if IS_ENABLED(CONFIG_DAX)
>  struct dax_device *dax_get_by_host(const char *host);
>  struct dax_device *alloc_dax(void *private, const char *host,
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index f5abba86107d..81de3d2739b9 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -1977,6 +1977,8 @@ struct super_operations {
>  				  struct shrink_control *);
>  	long (*free_cached_objects)(struct super_block *,
>  				    struct shrink_control *);
> +	int (*get_shared_files)(struct super_block *sb, pgoff_t offset,
> +				struct list_head *list);
>  };
>  
>  /*
> -- 
> 2.27.0
> 
> 
>