From: Darrick J. Wong <djwong@xxxxxxxxxx>

Introduce a variant on XFS_SCRUB_METADATA that allows for vectored mode.

Signed-off-by: Darrick J. Wong <djwong@xxxxxxxxxx>
---
 fs/xfs/libxfs/xfs_fs.h   |   38 ++++++++++++
 fs/xfs/scrub/scrub.c     |  155 ++++++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/scrub/trace.h     |   78 ++++++++++++++++++++++++
 fs/xfs/scrub/xfs_scrub.h |    2 +
 fs/xfs/xfs_ioctl.c       |   51 +++++++++++++++
 5 files changed, 323 insertions(+), 1 deletion(-)

diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 453b08612256..067dd0b1315b 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -751,6 +751,15 @@ struct xfs_scrub_metadata {
 /* Number of scrub subcommands. */
 #define XFS_SCRUB_TYPE_NR	32
 
+/*
+ * This special type code only applies to the vectored scrub implementation.
+ *
+ * If any of the previous scrub vectors recorded runtime errors or have
+ * sv_flags bits set that match the OFLAG bits in the barrier vector's
+ * sv_flags, set the barrier's sv_ret to -ECANCELED and return to userspace.
+ */
+#define XFS_SCRUB_TYPE_BARRIER	(-1U)
+
 /* i: Repair this metadata. */
 #define XFS_SCRUB_IFLAG_REPAIR		(1u << 0)
 
@@ -795,6 +804,34 @@ struct xfs_scrub_metadata {
 				 XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED)
 #define XFS_SCRUB_FLAGS_ALL	(XFS_SCRUB_FLAGS_IN | XFS_SCRUB_FLAGS_OUT)
 
+/* One item of a vectored scrub call: the request in, and its result out. */
+struct xfs_scrub_vec {
+	__u32 sv_type;		/* XFS_SCRUB_TYPE_* */
+	__u32 sv_flags;		/* XFS_SCRUB_FLAGS_* */
+	__s32 sv_ret;		/* 0 or a negative error code */
+	__u32 sv_reserved;	/* must be zero */
+};
+
+/* Vectored metadata scrub control structure. */
+struct xfs_scrub_vec_head {
+	__u64 svh_ino;		/* inode number. */
+	__u32 svh_gen;		/* inode generation. */
+	__u32 svh_agno;		/* ag number. */
+	__u32 svh_flags;	/* XFS_SCRUB_VEC_FLAGS_* */
+	__u16 svh_rest_us;	/* wait this much time between vector items */
+	__u16 svh_nr;		/* number of svh_vecs */
+
+	struct xfs_scrub_vec svh_vecs[];
+};
+
+#define XFS_SCRUB_VEC_FLAGS_ALL		(0)
+
+static inline size_t sizeof_xfs_scrub_vec(unsigned int nr)
+{
+	return sizeof(struct xfs_scrub_vec_head) +
+		nr * sizeof(struct xfs_scrub_vec);
+}
+
 /*
  * ioctl limits
  */
@@ -839,6 +876,7 @@ struct xfs_scrub_metadata {
 #define XFS_IOC_FREE_EOFBLOCKS	_IOR ('X', 58, struct xfs_fs_eofblocks)
 /*	XFS_IOC_GETFSMAP ------ hoisted 59         */
 #define XFS_IOC_SCRUB_METADATA	_IOWR('X', 60, struct xfs_scrub_metadata)
+#define XFS_IOC_SCRUBV_METADATA	_IOWR('X', 60, struct xfs_scrub_vec_head)
 #define XFS_IOC_AG_GEOMETRY	_IOWR('X', 61, struct xfs_ag_geometry)
 #define XFS_IOC_RTGROUP_GEOMETRY _IOWR('X', 62, struct xfs_rtgroup_geometry)
 
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
index 342a50248650..fc2cfef68366 100644
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -20,6 +20,7 @@
 #include "xfs_rmap.h"
 #include "xfs_xchgrange.h"
 #include "xfs_swapext.h"
+#include "xfs_icache.h"
 #include "scrub/scrub.h"
 #include "scrub/common.h"
 #include "scrub/trace.h"
@@ -726,3 +727,157 @@ xfs_scrub_metadata(
 	sc->flags |= XCHK_TRY_HARDER;
 	goto retry_op;
 }
+
+/* Decide if there have been any scrub failures up to this point. */
+static inline bool
+xfs_scrubv_previous_failures(
+	struct xfs_mount		*mp,
+	struct xfs_scrub_vec_head	*vhead,
+	struct xfs_scrub_vec		*barrier_vec)
+{
+	struct xfs_scrub_vec		*v;
+	__u32				failmask;
+
+	failmask = barrier_vec->sv_flags & XFS_SCRUB_FLAGS_OUT;
+
+	for (v = vhead->svh_vecs; v < barrier_vec; v++) {
+		if (v->sv_type == XFS_SCRUB_TYPE_BARRIER)
+			continue;
+
+		/*
+		 * Runtime errors count as a previous failure, except the ones
+		 * used to ask userspace to retry.
+		 */
+		if (v->sv_ret && v->sv_ret != -EBUSY && v->sv_ret != -ENOENT &&
+		    v->sv_ret != -EUSERS)
+			return true;
+
+		/*
+		 * If any of the out-flags on the scrub vector match the mask
+		 * that was set on the barrier vector, that's a previous fail.
+		 */
+		if (v->sv_flags & failmask)
+			return true;
+	}
+
+	return false;
+}
+
+/*
+ * Vectored scrub implementation to reduce ioctl calls.
+ *
+ * All vectors are validated before any of them run.  Per-item results are
+ * reported through sv_ret and sv_flags; the return value covers only the call
+ * itself: -EINVAL for bad flags or reserved fields, -EFSCORRUPTED if the ioctl
+ * file's inode cannot be grabbed, -EINTR on a fatal signal, otherwise zero.
+ */
+int
+xfs_scrubv_metadata(
+	struct file			*file,
+	struct xfs_scrub_vec_head	*vhead)
+{
+	struct xfs_inode		*ip_in = XFS_I(file_inode(file));
+	struct xfs_mount		*mp = ip_in->i_mount;
+	struct xfs_inode		*ip = NULL;
+	struct xfs_scrub_vec		*v;
+	bool				set_dontcache = false;
+	unsigned int			i;
+	int				error = 0;
+
+	BUILD_BUG_ON(sizeof(struct xfs_scrub_vec_head) ==
+		     sizeof(struct xfs_scrub_metadata));
+	BUILD_BUG_ON(XFS_IOC_SCRUB_METADATA == XFS_IOC_SCRUBV_METADATA);
+
+	trace_xchk_scrubv_start(ip_in, vhead);
+
+	if (vhead->svh_flags & ~XFS_SCRUB_VEC_FLAGS_ALL)
+		return -EINVAL;
+	for (i = 0, v = vhead->svh_vecs; i < vhead->svh_nr; i++, v++) {
+		if (v->sv_reserved)
+			return -EINVAL;
+		if (v->sv_type == XFS_SCRUB_TYPE_BARRIER &&
+		    (v->sv_flags & ~XFS_SCRUB_FLAGS_OUT))
+			return -EINVAL;
+
+		/*
+		 * If we detect at least one inode-type scrub, we might
+		 * consider setting dontcache at the end.
+		 */
+		if (v->sv_type < XFS_SCRUB_TYPE_NR &&
+		    meta_scrub_ops[v->sv_type].type == ST_INODE)
+			set_dontcache = true;
+
+		trace_xchk_scrubv_item(mp, vhead, v);
+	}
+
+	/*
+	 * If the caller provided us with a nonzero inode number that isn't the
+	 * ioctl file, try to grab a reference to it to eliminate all further
+	 * untrusted inode lookups.  If we can't get the inode, let each scrub
+	 * function try again.
+	 */
+	if (vhead->svh_ino && vhead->svh_ino != ip_in->i_ino) {
+		xfs_iget(mp, NULL, vhead->svh_ino, XFS_IGET_UNTRUSTED, 0, &ip);
+		if (ip && (VFS_I(ip)->i_generation != vhead->svh_gen ||
+			   (xfs_is_metadata_inode(ip) &&
+			    !S_ISDIR(VFS_I(ip)->i_mode)))) {
+			xfs_irele(ip);
+			ip = NULL;
+		}
+	}
+	if (!ip) {
+		if (!igrab(VFS_I(ip_in)))
+			return -EFSCORRUPTED;
+		ip = ip_in;
+	}
+
+	/* Run all the scrubbers. */
+	for (i = 0, v = vhead->svh_vecs; i < vhead->svh_nr; i++, v++) {
+		struct xfs_scrub_metadata	sm = {
+			.sm_type		= v->sv_type,
+			.sm_flags		= v->sv_flags,
+			.sm_ino			= vhead->svh_ino,
+			.sm_gen			= vhead->svh_gen,
+			.sm_agno		= vhead->svh_agno,
+		};
+
+		if (v->sv_type == XFS_SCRUB_TYPE_BARRIER) {
+			if (xfs_scrubv_previous_failures(mp, vhead, v)) {
+				v->sv_ret = -ECANCELED;
+				trace_xchk_scrubv_barrier_fail(mp, vhead, v);
+				break;
+			}
+
+			continue;
+		}
+
+		v->sv_ret = xfs_scrub_metadata(file, &sm);
+		v->sv_flags = sm.sm_flags;
+
+		/* Leave the inode in memory if something's wrong with it. */
+		if (xchk_needs_repair(&sm))
+			set_dontcache = false;
+
+		if (vhead->svh_rest_us) {
+			ktime_t		expires;
+
+			expires = ktime_add_ns(ktime_get(),
+					vhead->svh_rest_us * 1000);
+			set_current_state(TASK_KILLABLE);
+			schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
+		}
+		if (fatal_signal_pending(current)) {
+			error = -EINTR;
+			break;
+		}
+	}
+
+	/*
+	 * If we're holding the only reference to this inode and the scan was
+	 * clean, mark it dontcache so that we don't pollute the cache.
+	 */
+	if (set_dontcache && atomic_read(&VFS_I(ip)->i_count) == 1)
+		d_mark_dontcache(VFS_I(ip));
+	xfs_irele(ip);
+	return error;
+}
diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
index 0e945f842732..8767dd39b80c 100644
--- a/fs/xfs/scrub/trace.h
+++ b/fs/xfs/scrub/trace.h
@@ -80,6 +80,7 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_RGSUPER);
 TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_RGBITMAP);
 TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_RTRMAPBT);
 TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_RTREFCBT);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_BARRIER);
 
 #define XFS_SCRUB_TYPE_STRINGS \
 	{ XFS_SCRUB_TYPE_PROBE,		"probe" }, \
@@ -113,7 +114,8 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_RTREFCBT);
 	{ XFS_SCRUB_TYPE_RGSUPER,	"rgsuper" }, \
 	{ XFS_SCRUB_TYPE_RGBITMAP,	"rgbitmap" }, \
 	{ XFS_SCRUB_TYPE_RTRMAPBT,	"rtrmapbt" }, \
-	{ XFS_SCRUB_TYPE_RTREFCBT,	"rtrefcountbt" }
+	{ XFS_SCRUB_TYPE_RTREFCBT,	"rtrefcountbt" }, \
+	{ XFS_SCRUB_TYPE_BARRIER,	"barrier" }
 
 const char *xchk_type_string(unsigned int type);
 
@@ -213,6 +215,80 @@ DEFINE_EVENT(xchk_fshook_class, name, \
 DEFINE_SCRUB_FSHOOK_EVENT(xchk_fshooks_enable);
 DEFINE_SCRUB_FSHOOK_EVENT(xchk_fshooks_disable);
 
+DECLARE_EVENT_CLASS(xchk_vector_head_class,
+	TP_PROTO(struct xfs_inode *ip, struct xfs_scrub_vec_head *vhead),
+	TP_ARGS(ip, vhead),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(xfs_agnumber_t, agno)
+		__field(xfs_ino_t, inum)
+		__field(unsigned int, gen)
+		__field(unsigned int, flags)
+		__field(unsigned short, rest_us)
+		__field(unsigned short, nr_vecs)
+	),
+	TP_fast_assign(
+		__entry->dev = ip->i_mount->m_super->s_dev;
+		__entry->ino = ip->i_ino;
+		__entry->agno = vhead->svh_agno;
+		__entry->inum = vhead->svh_ino;
+		__entry->gen = vhead->svh_gen;
+		__entry->flags = vhead->svh_flags;
+		__entry->rest_us = vhead->svh_rest_us;
+		__entry->nr_vecs = vhead->svh_nr;
+	),
+	TP_printk("dev %d:%d ino 0x%llx agno 0x%x inum 0x%llx gen 0x%x flags 0x%x rest_us %u nr_vecs %u",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __entry->agno,
+		  __entry->inum,
+		  __entry->gen,
+		  __entry->flags,
+		  __entry->rest_us,
+		  __entry->nr_vecs)
+)
+#define DEFINE_SCRUBV_HEAD_EVENT(name) \
+DEFINE_EVENT(xchk_vector_head_class, name, \
+	TP_PROTO(struct xfs_inode *ip, struct xfs_scrub_vec_head *vhead), \
+	TP_ARGS(ip, vhead))
+
+DEFINE_SCRUBV_HEAD_EVENT(xchk_scrubv_start);
+
+DECLARE_EVENT_CLASS(xchk_vector_class,
+	TP_PROTO(struct xfs_mount *mp, struct xfs_scrub_vec_head *vhead,
+		 struct xfs_scrub_vec *v),
+	TP_ARGS(mp, vhead, v),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(unsigned int, vec_nr)
+		__field(unsigned int, vec_type)
+		__field(unsigned int, vec_flags)
+		__field(int, vec_ret)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->vec_nr = v - vhead->svh_vecs;
+		__entry->vec_type = v->sv_type;
+		__entry->vec_flags = v->sv_flags;
+		__entry->vec_ret = v->sv_ret;
+	),
+	TP_printk("dev %d:%d vec[%u] type %s flags 0x%x ret %d",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->vec_nr,
+		  __print_symbolic(__entry->vec_type, XFS_SCRUB_TYPE_STRINGS),
+		  __entry->vec_flags,
+		  __entry->vec_ret)
+)
+#define DEFINE_SCRUBV_EVENT(name) \
+DEFINE_EVENT(xchk_vector_class, name, \
+	TP_PROTO(struct xfs_mount *mp, struct xfs_scrub_vec_head *vhead, \
+		 struct xfs_scrub_vec *v), \
+	TP_ARGS(mp, vhead, v))
+
+DEFINE_SCRUBV_EVENT(xchk_scrubv_barrier_fail);
+DEFINE_SCRUBV_EVENT(xchk_scrubv_item);
+
 TRACE_EVENT(xchk_op_error,
 	TP_PROTO(struct xfs_scrub *sc, xfs_agnumber_t agno,
 		 xfs_agblock_t bno, int error, void *ret_ip),
diff --git a/fs/xfs/scrub/xfs_scrub.h b/fs/xfs/scrub/xfs_scrub.h
index 2ceae614ade8..bdf89242e6cd 100644
--- a/fs/xfs/scrub/xfs_scrub.h
+++ b/fs/xfs/scrub/xfs_scrub.h
@@ -8,8 +8,10 @@
 
 #ifndef CONFIG_XFS_ONLINE_SCRUB
 # define xfs_scrub_metadata(file, sm)	(-ENOTTY)
+# define xfs_scrubv_metadata(file, vhead)	(-ENOTTY)
 #else
 int xfs_scrub_metadata(struct file *file, struct xfs_scrub_metadata *sm);
+int xfs_scrubv_metadata(struct file *file, struct xfs_scrub_vec_head *vhead);
 #endif /* CONFIG_XFS_ONLINE_SCRUB */
 
 #endif	/* __XFS_SCRUB_H__ */
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index abca384c86a4..47704a7854cf 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -1643,6 +1643,55 @@ xfs_ioc_scrub_metadata(
 	return 0;
 }
 
+/*
+ * Copy a vectored scrub request in from userspace, run it, and copy the
+ * vector results back out.  The whole request must fit in one page.
+ */
+STATIC int
+xfs_ioc_scrubv_metadata(
+	struct file			*filp,
+	void				__user *arg)
+{
+	struct xfs_scrub_vec_head	__user *uhead = arg;
+	struct xfs_scrub_vec_head	head;
+	struct xfs_scrub_vec_head	*vhead;
+	size_t				bytes;
+	int				error;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (copy_from_user(&head, uhead, sizeof(head)))
+		return -EFAULT;
+
+	bytes = sizeof_xfs_scrub_vec(head.svh_nr);
+	if (bytes > PAGE_SIZE)
+		return -ENOMEM;
+	vhead = kvmalloc(bytes, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
+	if (!vhead)
+		return -ENOMEM;
+	memcpy(vhead, &head, sizeof(struct xfs_scrub_vec_head));
+
+	if (copy_from_user(vhead->svh_vecs, uhead->svh_vecs,
+				head.svh_nr * sizeof(struct xfs_scrub_vec))) {
+		error = -EFAULT;
+		goto err_free;
+	}
+
+	error = xfs_scrubv_metadata(filp, vhead);
+	if (error)
+		goto err_free;
+
+	if (copy_to_user(uhead, vhead, bytes)) {
+		error = -EFAULT;
+		goto err_free;
+	}
+
+err_free:
+	kvfree(vhead);
+	return error;
+}
+
 int
 xfs_ioc_swapext(
 	struct xfs_swapext	*sxp)
@@ -1908,6 +1957,8 @@ xfs_file_ioctl(
 	case FS_IOC_GETFSMAP:
 		return xfs_ioc_getfsmap(ip, arg);
 
+	case XFS_IOC_SCRUBV_METADATA:
+		return xfs_ioc_scrubv_metadata(filp, arg);
 	case XFS_IOC_SCRUB_METADATA:
 		return xfs_ioc_scrub_metadata(filp, arg);
 