On Tue, Apr 19, 2022 at 12:50:44PM +0800, Shiyang Ruan wrote: > Introduce xfs_notify_failure.c to handle failure related works, such as > implement ->notify_failure(), register/unregister dax holder in xfs, and > so on. > > If the rmap feature of XFS enabled, we can query it to find files and > metadata which are associated with the corrupt data. For now all we do > is kill processes with that file mapped into their address spaces, but > future patches could actually do something about corrupt metadata. > > After that, the memory failure needs to notify the processes who are > using those files. > > Signed-off-by: Shiyang Ruan <ruansy.fnst@xxxxxxxxxxx> > Reviewed-by: Christoph Hellwig <hch@xxxxxx> > --- > fs/xfs/Makefile | 5 + > fs/xfs/xfs_buf.c | 11 +- > fs/xfs/xfs_fsops.c | 3 + > fs/xfs/xfs_mount.h | 1 + > fs/xfs/xfs_notify_failure.c | 220 ++++++++++++++++++++++++++++++++++++ > fs/xfs/xfs_super.h | 1 + > 6 files changed, 238 insertions(+), 3 deletions(-) > create mode 100644 fs/xfs/xfs_notify_failure.c > > diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile > index 04611a1068b4..09f5560e29f2 100644 > --- a/fs/xfs/Makefile > +++ b/fs/xfs/Makefile > @@ -128,6 +128,11 @@ xfs-$(CONFIG_SYSCTL) += xfs_sysctl.o > xfs-$(CONFIG_COMPAT) += xfs_ioctl32.o > xfs-$(CONFIG_EXPORTFS_BLOCK_OPS) += xfs_pnfs.o > > +# notify failure > +ifeq ($(CONFIG_MEMORY_FAILURE),y) > +xfs-$(CONFIG_FS_DAX) += xfs_notify_failure.o > +endif > + > # online scrub/repair > ifeq ($(CONFIG_XFS_ONLINE_SCRUB),y) > > diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c > index f9ca08398d32..084455f7e2ff 100644 > --- a/fs/xfs/xfs_buf.c > +++ b/fs/xfs/xfs_buf.c > @@ -5,6 +5,7 @@ > */ > #include "xfs.h" > #include <linux/backing-dev.h> > +#include <linux/dax.h> > > #include "xfs_shared.h" > #include "xfs_format.h" > @@ -1911,7 +1912,7 @@ xfs_free_buftarg( > list_lru_destroy(&btp->bt_lru); > > blkdev_issue_flush(btp->bt_bdev); > - fs_put_dax(btp->bt_daxdev, NULL); > + fs_put_dax(btp->bt_daxdev, btp->bt_mount); > > 
kmem_free(btp); > } > @@ -1958,14 +1959,18 @@ xfs_alloc_buftarg( > struct block_device *bdev) > { > xfs_buftarg_t *btp; > + const struct dax_holder_operations *ops = NULL; > > +#if defined(CONFIG_FS_DAX) && defined(CONFIG_MEMORY_FAILURE) > + ops = &xfs_dax_holder_operations; > +#endif > btp = kmem_zalloc(sizeof(*btp), KM_NOFS); > > btp->bt_mount = mp; > btp->bt_dev = bdev->bd_dev; > btp->bt_bdev = bdev; > - btp->bt_daxdev = fs_dax_get_by_bdev(bdev, &btp->bt_dax_part_off, NULL, > - NULL); > + btp->bt_daxdev = fs_dax_get_by_bdev(bdev, &btp->bt_dax_part_off, > + mp, ops); > > /* > * Buffer IO error rate limiting. Limit it to no more than 10 messages > diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c > index 68f74549fa22..56530900bb86 100644 > --- a/fs/xfs/xfs_fsops.c > +++ b/fs/xfs/xfs_fsops.c > @@ -536,6 +536,9 @@ xfs_do_force_shutdown( > } else if (flags & SHUTDOWN_CORRUPT_INCORE) { > tag = XFS_PTAG_SHUTDOWN_CORRUPT; > why = "Corruption of in-memory data"; > + } else if (flags & SHUTDOWN_CORRUPT_ONDISK) { > + tag = XFS_PTAG_SHUTDOWN_CORRUPT; > + why = "Corruption of on-disk metadata"; > } else { > tag = XFS_PTAG_SHUTDOWN_IOERROR; > why = "Metadata I/O Error"; > diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h > index f6dc19de8322..9237cc159542 100644 > --- a/fs/xfs/xfs_mount.h > +++ b/fs/xfs/xfs_mount.h > @@ -435,6 +435,7 @@ void xfs_do_force_shutdown(struct xfs_mount *mp, int flags, char *fname, > #define SHUTDOWN_LOG_IO_ERROR 0x0002 /* write attempt to the log failed */ > #define SHUTDOWN_FORCE_UMOUNT 0x0004 /* shutdown from a forced unmount */ > #define SHUTDOWN_CORRUPT_INCORE 0x0008 /* corrupt in-memory data structures */ > +#define SHUTDOWN_CORRUPT_ONDISK 0x0010 /* corrupt metadata on device */ > > #define XFS_SHUTDOWN_STRINGS \ > { SHUTDOWN_META_IO_ERROR, "metadata_io" }, \ > diff --git a/fs/xfs/xfs_notify_failure.c b/fs/xfs/xfs_notify_failure.c > new file mode 100644 > index 000000000000..0702a402688a > --- /dev/null > +++ 
b/fs/xfs/xfs_notify_failure.c > @@ -0,0 +1,220 @@ > +// SPDX-License-Identifier: GPL-2.0 > +/* > + * Copyright (c) 2022 Fujitsu. All Rights Reserved. > + */ > + > +#include "xfs.h" > +#include "xfs_shared.h" > +#include "xfs_format.h" > +#include "xfs_log_format.h" > +#include "xfs_trans_resv.h" > +#include "xfs_mount.h" > +#include "xfs_alloc.h" > +#include "xfs_bit.h" > +#include "xfs_btree.h" > +#include "xfs_inode.h" > +#include "xfs_icache.h" > +#include "xfs_rmap.h" > +#include "xfs_rmap_btree.h" > +#include "xfs_rtalloc.h" > +#include "xfs_trans.h" > + > +#include <linux/mm.h> > +#include <linux/dax.h> > + > +struct failure_info { > + xfs_agblock_t startblock; > + xfs_extlen_t blockcount; > + int mf_flags; > +}; > + > +static pgoff_t > +xfs_failure_pgoff( > + struct xfs_mount *mp, > + const struct xfs_rmap_irec *rec, > + const struct failure_info *notify) > +{ > + uint64_t pos = rec->rm_offset; > + > + if (notify->startblock > rec->rm_startblock) > + pos += XFS_FSB_TO_B(mp, > + notify->startblock - rec->rm_startblock); > + return pos >> PAGE_SHIFT; I don't think the unit conversion here is correct. rec->rm_offset is an xfs_fileoff_t (aka a logical file block). The if statement body adds a quantity that is in units of bytes to that block-unit value, which is incorrect. The return statement looks like a bytes-to-pgoff conversion, which doesn't apply to an xfs_fileoff_t quantity. The function *would* make sense if the first line of the function were: loff_t pos = XFS_FSB_TO_B(mp, rec->rm_offset); Everything else in this patch finally looks good though.
--D > +} > + > +static unsigned long > +xfs_failure_pgcnt( > + struct xfs_mount *mp, > + const struct xfs_rmap_irec *rec, > + const struct failure_info *notify) > +{ > + xfs_agblock_t end_rec; > + xfs_agblock_t end_notify; > + xfs_agblock_t start_cross; > + xfs_agblock_t end_cross; > + > + start_cross = max(rec->rm_startblock, notify->startblock); > + > + end_rec = rec->rm_startblock + rec->rm_blockcount; > + end_notify = notify->startblock + notify->blockcount; > + end_cross = min(end_rec, end_notify); > + > + return XFS_FSB_TO_B(mp, end_cross - start_cross) >> PAGE_SHIFT; > +} > + > +static int > +xfs_dax_failure_fn( > + struct xfs_btree_cur *cur, > + const struct xfs_rmap_irec *rec, > + void *data) > +{ > + struct xfs_mount *mp = cur->bc_mp; > + struct xfs_inode *ip; > + struct failure_info *notify = data; > + int error = 0; > + > + if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) || > + (rec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))) { > + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK); > + return -EFSCORRUPTED; > + } > + > + /* Get files that incore, filter out others that are not in use. 
*/ > + error = xfs_iget(mp, cur->bc_tp, rec->rm_owner, XFS_IGET_INCORE, > + 0, &ip); > + /* Continue the rmap query if the inode isn't incore */ > + if (error == -ENODATA) > + return 0; > + if (error) > + return error; > + > + error = mf_dax_kill_procs(VFS_I(ip)->i_mapping, > + xfs_failure_pgoff(mp, rec, notify), > + xfs_failure_pgcnt(mp, rec, notify), > + notify->mf_flags); > + xfs_irele(ip); > + return error; > +} > + > +static int > +xfs_dax_notify_ddev_failure( > + struct xfs_mount *mp, > + xfs_daddr_t daddr, > + xfs_daddr_t bblen, > + int mf_flags) > +{ > + struct xfs_trans *tp = NULL; > + struct xfs_btree_cur *cur = NULL; > + struct xfs_buf *agf_bp = NULL; > + int error = 0; > + xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, daddr); > + xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, fsbno); > + xfs_fsblock_t end_fsbno = XFS_DADDR_TO_FSB(mp, daddr + bblen); > + xfs_agnumber_t end_agno = XFS_FSB_TO_AGNO(mp, end_fsbno); > + > + error = xfs_trans_alloc_empty(mp, &tp); > + if (error) > + return error; > + > + for (; agno <= end_agno; agno++) { > + struct xfs_rmap_irec ri_low = { }; > + struct xfs_rmap_irec ri_high; > + struct failure_info notify; > + struct xfs_agf *agf; > + xfs_agblock_t agend; > + > + error = xfs_alloc_read_agf(mp, tp, agno, 0, &agf_bp); > + if (error) > + break; > + > + cur = xfs_rmapbt_init_cursor(mp, tp, agf_bp, agf_bp->b_pag); > + > + /* > + * Set the rmap range from ri_low to ri_high, which represents > + * a [start, end] where we looking for the files or metadata. 
> + */ > + memset(&ri_high, 0xFF, sizeof(ri_high)); > + ri_low.rm_startblock = XFS_FSB_TO_AGBNO(mp, fsbno); > + if (agno == end_agno) > + ri_high.rm_startblock = XFS_FSB_TO_AGBNO(mp, end_fsbno); > + > + agf = agf_bp->b_addr; > + agend = min(be32_to_cpu(agf->agf_length), > + ri_high.rm_startblock); > + notify.startblock = ri_low.rm_startblock; > + notify.blockcount = agend - ri_low.rm_startblock; > + > + error = xfs_rmap_query_range(cur, &ri_low, &ri_high, > + xfs_dax_failure_fn, &notify); > + xfs_btree_del_cursor(cur, error); > + xfs_trans_brelse(tp, agf_bp); > + if (error) > + break; > + > + fsbno = XFS_AGB_TO_FSB(mp, agno + 1, 0); > + } > + > + xfs_trans_cancel(tp); > + return error; > +} > + > +static int > +xfs_dax_notify_failure( > + struct dax_device *dax_dev, > + u64 offset, > + u64 len, > + int mf_flags) > +{ > + struct xfs_mount *mp = dax_holder(dax_dev); > + u64 ddev_start; > + u64 ddev_end; > + > + if (!(mp->m_sb.sb_flags & SB_BORN)) { > + xfs_warn(mp, "filesystem is not ready for notify_failure()!"); > + return -EIO; > + } > + > + if (mp->m_rtdev_targp && mp->m_rtdev_targp->bt_daxdev == dax_dev) { > + xfs_warn(mp, > + "notify_failure() not supported on realtime device!"); > + return -EOPNOTSUPP; > + } > + > + if (mp->m_logdev_targp && mp->m_logdev_targp->bt_daxdev == dax_dev && > + mp->m_logdev_targp != mp->m_ddev_targp) { > + xfs_err(mp, "ondisk log corrupt, shutting down fs!"); > + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK); > + return -EFSCORRUPTED; > + } > + > + if (!xfs_has_rmapbt(mp)) { > + xfs_warn(mp, "notify_failure() needs rmapbt enabled!"); > + return -EOPNOTSUPP; > + } > + > + ddev_start = mp->m_ddev_targp->bt_dax_part_off; > + ddev_end = ddev_start + bdev_nr_bytes(mp->m_ddev_targp->bt_bdev) - 1; > + > + /* Ignore the range out of filesystem area */ > + if (offset + len < ddev_start) > + return -ENXIO; > + if (offset > ddev_end) > + return -ENXIO; > + > + /* Calculate the real range when it touches the boundary */ > + if (offset >
ddev_start) > + offset -= ddev_start; > + else { > + len -= ddev_start - offset; > + offset = 0; > + } > + if (offset + len > ddev_end) > + len -= ddev_end - offset; > + > + return xfs_dax_notify_ddev_failure(mp, BTOBB(offset), BTOBB(len), > + mf_flags); > +} > + > +const struct dax_holder_operations xfs_dax_holder_operations = { > + .notify_failure = xfs_dax_notify_failure, > +}; > diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h > index 167d23f92ffe..27ab5087d0b3 100644 > --- a/fs/xfs/xfs_super.h > +++ b/fs/xfs/xfs_super.h > @@ -93,6 +93,7 @@ extern xfs_agnumber_t xfs_set_inode_alloc(struct xfs_mount *, > extern const struct export_operations xfs_export_operations; > extern const struct xattr_handler *xfs_xattr_handlers[]; > extern const struct quotactl_ops xfs_quotactl_operations; > +extern const struct dax_holder_operations xfs_dax_holder_operations; > > extern void xfs_reinit_percpu_counters(struct xfs_mount *mp); > > -- > 2.35.1 > > >