From: Darrick J. Wong <djwong@xxxxxxxxxx> So far we've constructed the top half of file range exchange which deals with VFS-level objects; and the bottom half of extent swapping, which deals with file mappings in XFS data structures. We still need to glue the two pieces together so do that now. Signed-off-by: Darrick J. Wong <djwong@xxxxxxxxxx> --- fs/xfs/xfs_bmap_util.c | 1 fs/xfs/xfs_mount.h | 5 - fs/xfs/xfs_trace.c | 1 fs/xfs/xfs_trace.h | 127 +++++++++++++ fs/xfs/xfs_xchgrange.c | 462 ++++++++++++++++++++++++++++++++++++++++++++++++ fs/xfs/xfs_xchgrange.h | 28 +++ 6 files changed, 622 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 892622530431a..87ac9777c1eaf 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -29,6 +29,7 @@ #include "xfs_iomap.h" #include "xfs_reflink.h" #include "xfs_rtbitmap.h" +#include "xfs_swapext.h" /* Kernel only BMAP related definitions and functions */ diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 257bef8019307..f80f04335acc5 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -412,6 +412,8 @@ __XFS_HAS_FEAT(nouuid, NOUUID) #define XFS_OPSTATE_WARNED_LARP 9 /* Mount time quotacheck is running */ #define XFS_OPSTATE_QUOTACHECK_RUNNING 10 +/* Kernel has logged a warning about extent swapping being used on this fs. */ +#define XFS_OPSTATE_WARNED_SWAPEXT 11 #define __XFS_IS_OPSTATE(name, NAME) \ static inline bool xfs_is_ ## name (struct xfs_mount *mp) \ @@ -457,7 +459,8 @@ xfs_should_warn(struct xfs_mount *mp, long nr) { (1UL << XFS_OPSTATE_WARNED_SCRUB), "wscrub" }, \ { (1UL << XFS_OPSTATE_WARNED_SHRINK), "wshrink" }, \ { (1UL << XFS_OPSTATE_WARNED_LARP), "wlarp" }, \ - { (1UL << XFS_OPSTATE_QUOTACHECK_RUNNING), "quotacheck" } + { (1UL << XFS_OPSTATE_QUOTACHECK_RUNNING), "quotacheck" }, \ + { (1UL << XFS_OPSTATE_WARNED_SWAPEXT), "wswapext" } /* * Max and min values for mount-option defined I/O diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c index b43b973f0e102..e38814f4380c8 100644 --- a/fs/xfs/xfs_trace.c +++ b/fs/xfs/xfs_trace.c @@ -41,6 +41,7 @@ #include "xfs_btree_mem.h" #include "xfs_bmap.h" #include "xfs_swapext.h" +#include "xfs_xchgrange.h" /* * We include this last to have the helpers above available for the trace diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 893f69a2308ca..53a6122d307ff 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -3750,11 +3750,138 @@ DEFINE_INODE_IREC_EVENT(xfs_reflink_cancel_cow); DEFINE_INODE_IREC_EVENT(xfs_swap_extent_rmap_remap); DEFINE_INODE_IREC_EVENT(xfs_swap_extent_rmap_remap_piece); DEFINE_INODE_ERROR_EVENT(xfs_swap_extent_rmap_error); + +/* swapext tracepoints */ +DEFINE_INODE_ERROR_EVENT(xfs_file_xchg_range_error); DEFINE_INODE_IREC_EVENT(xfs_swapext_extent1_skip); DEFINE_INODE_IREC_EVENT(xfs_swapext_extent1); DEFINE_INODE_IREC_EVENT(xfs_swapext_extent2); DEFINE_ITRUNC_EVENT(xfs_swapext_update_inode_size); +#define XFS_EXCH_RANGE_FLAGS_STRS \ + { XFS_EXCH_RANGE_NONATOMIC, "NONATOMIC" }, \ + { XFS_EXCH_RANGE_FILE2_FRESH, "F2_FRESH" }, \ + { XFS_EXCH_RANGE_FULL_FILES, "FULL" }, \ + { XFS_EXCH_RANGE_TO_EOF, "TO_EOF" }, \ + { XFS_EXCH_RANGE_FSYNC , "FSYNC" }, \ + { XFS_EXCH_RANGE_DRY_RUN, "DRY_RUN" }, \ + { XFS_EXCH_RANGE_FILE1_WRITTEN, "F1_WRITTEN" } + +/* file exchange-range tracepoint class */ +DECLARE_EVENT_CLASS(xfs_xchg_range_class, + TP_PROTO(struct xfs_inode *ip1, const struct xfs_exch_range *fxr, + struct xfs_inode *ip2, unsigned int xchg_flags), + TP_ARGS(ip1, fxr, ip2, xchg_flags), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ip1_ino) + __field(loff_t, ip1_isize) + __field(loff_t, ip1_disize) + __field(xfs_ino_t, ip2_ino) + __field(loff_t, ip2_isize) + __field(loff_t, ip2_disize) + + __field(loff_t, file1_offset) + __field(loff_t, file2_offset) + __field(unsigned long long, length) + __field(unsigned long long, vflags) + __field(unsigned int, xflags) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip1)->i_sb->s_dev; + __entry->ip1_ino = ip1->i_ino; + __entry->ip1_isize = VFS_I(ip1)->i_size; + __entry->ip1_disize = ip1->i_disk_size; + __entry->ip2_ino = ip2->i_ino; + __entry->ip2_isize = VFS_I(ip2)->i_size; + __entry->ip2_disize = ip2->i_disk_size; + + __entry->file1_offset = fxr->file1_offset; + __entry->file2_offset = fxr->file2_offset; + __entry->length = fxr->length; + __entry->vflags = fxr->flags; + __entry->xflags = xchg_flags; + ), + TP_printk("dev %d:%d vfs_flags %s xchg_flags %s bytecount 0x%llx " + "ino1 0x%llx isize 0x%llx disize 0x%llx pos 0x%llx -> " + "ino2 0x%llx isize 0x%llx disize 0x%llx pos 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __print_flags(__entry->vflags, "|", XFS_EXCH_RANGE_FLAGS_STRS), + __print_flags(__entry->xflags, "|", XCHG_RANGE_FLAGS_STRS), + __entry->length, + __entry->ip1_ino, + __entry->ip1_isize, + __entry->ip1_disize, + __entry->file1_offset, + __entry->ip2_ino, + __entry->ip2_isize, + __entry->ip2_disize, + __entry->file2_offset) +) + +#define DEFINE_XCHG_RANGE_EVENT(name) \ +DEFINE_EVENT(xfs_xchg_range_class, name, \ + TP_PROTO(struct xfs_inode *ip1, const struct xfs_exch_range *fxr, \ + struct xfs_inode *ip2, unsigned int xchg_flags), \ + TP_ARGS(ip1, fxr, ip2, xchg_flags)) +DEFINE_XCHG_RANGE_EVENT(xfs_xchg_range_prep); +DEFINE_XCHG_RANGE_EVENT(xfs_xchg_range_flush); +DEFINE_XCHG_RANGE_EVENT(xfs_xchg_range); + +TRACE_EVENT(xfs_xchg_range_freshness, + TP_PROTO(struct xfs_inode *ip2, const struct xfs_exch_range *fxr), + TP_ARGS(ip2, fxr), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ip2_ino) + __field(long long, ip2_mtime) + __field(long long, ip2_ctime) + __field(int, ip2_mtime_nsec) + __field(int, ip2_ctime_nsec) + + __field(xfs_ino_t, file2_ino) + __field(long long, file2_mtime) + __field(long long, file2_ctime) + __field(int, file2_mtime_nsec) + __field(int, file2_ctime_nsec) + ), + TP_fast_assign( + struct timespec64 ts64; + struct inode *inode2 = VFS_I(ip2); + + __entry->dev = inode2->i_sb->s_dev; + __entry->ip2_ino = ip2->i_ino; + + ts64 = inode_get_ctime(inode2); + __entry->ip2_ctime = ts64.tv_sec; + __entry->ip2_ctime_nsec = ts64.tv_nsec; + + ts64 = inode_get_mtime(inode2); + __entry->ip2_mtime = ts64.tv_sec; + __entry->ip2_mtime_nsec = ts64.tv_nsec; + + __entry->file2_ino = fxr->file2_ino; + __entry->file2_mtime = fxr->file2_mtime; + __entry->file2_ctime = fxr->file2_ctime; + __entry->file2_mtime_nsec = fxr->file2_mtime_nsec; + __entry->file2_ctime_nsec = fxr->file2_ctime_nsec; + ), + TP_printk("dev %d:%d " + "ino 0x%llx mtime %lld:%d ctime %lld:%d -> " + "file 0x%llx mtime %lld:%d ctime %lld:%d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ip2_ino, + __entry->ip2_mtime, + __entry->ip2_mtime_nsec, + __entry->ip2_ctime, + __entry->ip2_ctime_nsec, + __entry->file2_ino, + __entry->file2_mtime, + __entry->file2_mtime_nsec, + __entry->file2_ctime, + __entry->file2_ctime_nsec) +); + /* fsmap traces */ DECLARE_EVENT_CLASS(xfs_fsmap_class, TP_PROTO(struct xfs_mount *mp, u32 keydev, xfs_agnumber_t agno, diff --git a/fs/xfs/xfs_xchgrange.c b/fs/xfs/xfs_xchgrange.c index ebbd7b1fe3983..835e83c90f7f5 100644 --- a/fs/xfs/xfs_xchgrange.c +++ b/fs/xfs/xfs_xchgrange.c @@ -12,8 +12,15 @@ #include "xfs_defer.h" #include "xfs_inode.h" #include "xfs_trans.h" +#include "xfs_quota.h" +#include "xfs_bmap_util.h" +#include "xfs_reflink.h" +#include "xfs_trace.h" #include "xfs_swapext.h" #include "xfs_xchgrange.h" +#include "xfs_sb.h" +#include "xfs_icache.h" +#include "xfs_log.h" #include <linux/fsnotify.h> /* @@ -320,7 +327,7 @@ __xfs_exch_range( if (ret) return ret; - ret = -EOPNOTSUPP; /* XXX call out to xfs code */ + ret = xfs_file_xchg_range(file1, file2, fxr); if (ret) return ret; @@ -347,6 +354,78 @@ xfs_exch_range( /* XFS-specific parts of XFS_IOC_EXCHANGE_RANGE */ +/* + * Exchanging ranges as a file operation. This is the binding between the + * VFS-level concepts and the XFS-specific implementation. + */ +int +xfs_file_xchg_range( + struct file *file1, + struct file *file2, + struct xfs_exch_range *fxr) +{ + struct inode *inode1 = file_inode(file1); + struct inode *inode2 = file_inode(file2); + struct xfs_inode *ip1 = XFS_I(inode1); + struct xfs_inode *ip2 = XFS_I(inode2); + struct xfs_mount *mp = ip1->i_mount; + unsigned int priv_flags = 0; + bool use_logging = false; + int error; + + if (xfs_is_shutdown(mp)) + return -EIO; + + /* Update cmtime if the fd/inode don't forbid it. */ + if (likely(!(file1->f_mode & FMODE_NOCMTIME) && !IS_NOCMTIME(inode1))) + priv_flags |= XFS_XCHG_RANGE_UPD_CMTIME1; + if (likely(!(file2->f_mode & FMODE_NOCMTIME) && !IS_NOCMTIME(inode2))) + priv_flags |= XFS_XCHG_RANGE_UPD_CMTIME2; + + /* Lock both files against IO */ + error = xfs_ilock2_io_mmap(ip1, ip2); + if (error) + goto out_err; + + /* Get permission to use log-assisted file content swaps. */ + error = xfs_xchg_range_grab_log_assist(mp, + !(fxr->flags & XFS_EXCH_RANGE_NONATOMIC), + &use_logging); + if (error) + goto out_unlock; + if (use_logging) + priv_flags |= XFS_XCHG_RANGE_LOGGED; + + /* Prepare and then exchange file contents. */ + error = xfs_xchg_range_prep(file1, file2, fxr); + if (error) + goto out_drop_feat; + + error = xfs_xchg_range(ip1, ip2, fxr, priv_flags); + if (error) + goto out_drop_feat; + + /* + * Finish the exchange by removing special file privileges like any + * other file write would do. This may involve turning on support for + * logged xattrs if either file has security capabilities, which means + * xfs_xchg_range_grab_log_assist before xfs_attr_grab_log_assist. + */ + error = xfs_exch_range_finish(file1, file2); + if (error) + goto out_drop_feat; + +out_drop_feat: + if (use_logging) + xfs_xchg_range_rele_log_assist(mp); +out_unlock: + xfs_iunlock2_io_mmap(ip1, ip2); +out_err: + if (error) + trace_xfs_file_xchg_range_error(ip2, error, _RET_IP_); + return error; +} + /* Lock (and optionally join) two inodes for a file range exchange. */ void xfs_xchg_range_ilock( @@ -394,3 +473,384 @@ xfs_xchg_range_estimate( xfs_xchg_range_iunlock(req->ip1, req->ip2); return error; } + +/* Prepare two files to have their data exchanged. */ +int +xfs_xchg_range_prep( + struct file *file1, + struct file *file2, + struct xfs_exch_range *fxr) +{ + struct xfs_inode *ip1 = XFS_I(file_inode(file1)); + struct xfs_inode *ip2 = XFS_I(file_inode(file2)); + int error; + + trace_xfs_xchg_range_prep(ip1, fxr, ip2, 0); + + /* Verify both files are either real-time or non-realtime */ + if (XFS_IS_REALTIME_INODE(ip1) != XFS_IS_REALTIME_INODE(ip2)) + return -EINVAL; + + /* + * The alignment checks in the VFS helpers cannot deal with allocation + * units that are not powers of 2. This can happen with the realtime + * volume if the extent size is set. Note that alignment checks are + * skipped if FULL_FILES is set. + */ + if (!(fxr->flags & XFS_EXCH_RANGE_FULL_FILES) && + !is_power_of_2(xfs_inode_alloc_unitsize(ip2))) + return -EOPNOTSUPP; + + error = xfs_exch_range_prep(file1, file2, fxr, + xfs_inode_alloc_unitsize(ip2)); + if (error || fxr->length == 0) + return error; + + /* Attach dquots to both inodes before changing block maps. */ + error = xfs_qm_dqattach(ip2); + if (error) + return error; + error = xfs_qm_dqattach(ip1); + if (error) + return error; + + trace_xfs_xchg_range_flush(ip1, fxr, ip2, 0); + + /* Flush the relevant ranges of both files. */ + error = xfs_flush_unmap_range(ip2, fxr->file2_offset, fxr->length); + if (error) + return error; + error = xfs_flush_unmap_range(ip1, fxr->file1_offset, fxr->length); + if (error) + return error; + + /* + * Cancel CoW fork preallocations for the ranges of both files. The + * prep function should have flushed all the dirty data, so the only + * extents remaining should be speculative. + */ + if (xfs_inode_has_cow_data(ip1)) { + error = xfs_reflink_cancel_cow_range(ip1, fxr->file1_offset, + fxr->length, true); + if (error) + return error; + } + + if (xfs_inode_has_cow_data(ip2)) { + error = xfs_reflink_cancel_cow_range(ip2, fxr->file2_offset, + fxr->length, true); + if (error) + return error; + } + + return 0; +} + +#define QRETRY_IP1 (0x1) +#define QRETRY_IP2 (0x2) + +/* + * Obtain a quota reservation to make sure we don't hit EDQUOT. We can skip + * this if quota enforcement is disabled or if both inodes' dquots are the + * same. The qretry structure must be initialized to zeroes before the first + * call to this function. + */ +STATIC int +xfs_xchg_range_reserve_quota( + struct xfs_trans *tp, + const struct xfs_swapext_req *req, + unsigned int *qretry) +{ + int64_t ddelta, rdelta; + int ip1_error = 0; + int error; + + /* + * Don't bother with a quota reservation if we're not enforcing them + * or the two inodes have the same dquots. + */ + if (!XFS_IS_QUOTA_ON(tp->t_mountp) || req->ip1 == req->ip2 || + (req->ip1->i_udquot == req->ip2->i_udquot && + req->ip1->i_gdquot == req->ip2->i_gdquot && + req->ip1->i_pdquot == req->ip2->i_pdquot)) + return 0; + + *qretry = 0; + + /* + * For each file, compute the net gain in the number of regular blocks + * that will be mapped into that file and reserve that much quota. The + * quota counts must be able to absorb at least that much space. + */ + ddelta = req->ip2_bcount - req->ip1_bcount; + rdelta = req->ip2_rtbcount - req->ip1_rtbcount; + if (ddelta > 0 || rdelta > 0) { + error = xfs_trans_reserve_quota_nblks(tp, req->ip1, + ddelta > 0 ? ddelta : 0, + rdelta > 0 ? rdelta : 0, + false); + if (error == -EDQUOT || error == -ENOSPC) { + /* + * Save this error and see what happens if we try to + * reserve quota for ip2. Then report both. + */ + *qretry |= QRETRY_IP1; + ip1_error = error; + error = 0; + } + if (error) + return error; + } + if (ddelta < 0 || rdelta < 0) { + error = xfs_trans_reserve_quota_nblks(tp, req->ip2, + ddelta < 0 ? -ddelta : 0, + rdelta < 0 ? -rdelta : 0, + false); + if (error == -EDQUOT || error == -ENOSPC) + *qretry |= QRETRY_IP2; + if (error) + return error; + } + if (ip1_error) + return ip1_error; + + /* + * For each file, forcibly reserve the gross gain in mapped blocks so + * that we don't trip over any quota block reservation assertions. + * We must reserve the gross gain because the quota code subtracts from + * bcount the number of blocks that we unmap; it does not add that + * quantity back to the quota block reservation. + */ + error = xfs_trans_reserve_quota_nblks(tp, req->ip1, req->ip1_bcount, + req->ip1_rtbcount, true); + if (error) + return error; + + return xfs_trans_reserve_quota_nblks(tp, req->ip2, req->ip2_bcount, + req->ip2_rtbcount, true); +} + +/* + * Get permission to use log-assisted atomic exchange of file extents. + * + * Callers must hold the IOLOCK and MMAPLOCK of both files. They must not be + * running any transactions or hold any ILOCKS. If @use_logging is set after a + * successful return, callers must call xfs_xchg_range_rele_log_assist after + * the exchange is completed. + */ +int +xfs_xchg_range_grab_log_assist( + struct xfs_mount *mp, + bool force, + bool *use_logging) +{ + int error = 0; + + /* + * As a performance optimization, skip the log force and super write + * if the filesystem featureset already protects the swapext log items. + */ + if (xfs_swapext_can_use_without_log_assistance(mp)) { + *use_logging = true; + return 0; + } + + /* + * Protect ourselves from an idle log clearing the atomic swapext + * log incompat feature bit. + */ + xlog_use_incompat_feat(mp->m_log, XLOG_INCOMPAT_FEAT_SWAPEXT); + *use_logging = true; + + /* + * If log-assisted swapping is already enabled, the caller can use the + * log assisted swap functions with the log-incompat reference we got. + */ + if (xfs_sb_version_haslogswapext(&mp->m_sb)) + return 0; + + /* + * If the caller doesn't /require/ log-assisted swapping, drop the + * incore log-incompat feature protection and exit. The caller will + * not be able to use log assisted swapping. + */ + if (!force) + goto drop_incompat; + + /* + * Check if the filesystem featureset is new enough to set this log + * incompat feature bit. Strictly speaking, the minimum requirement is + * a V5 filesystem for the superblock field, but we'll require bigtime + * to avoid having to deal with really old kernels. + */ + if (!xfs_has_bigtime(mp)) { + error = -EOPNOTSUPP; + goto drop_incompat; + } + + error = xfs_add_incompat_log_feature(mp, + XFS_SB_FEAT_INCOMPAT_LOG_SWAPEXT); + if (error) + goto drop_incompat; + + xfs_warn_mount(mp, XFS_OPSTATE_WARNED_SWAPEXT, + "EXPERIMENTAL atomic file range swap feature in use. Use at your own risk!"); + + return 0; +drop_incompat: + xlog_drop_incompat_feat(mp->m_log, XLOG_INCOMPAT_FEAT_SWAPEXT); + *use_logging = false; + return error; +} + +/* Release permission to use log-assisted extent swapping. */ +void +xfs_xchg_range_rele_log_assist( + struct xfs_mount *mp) +{ + if (!xfs_swapext_can_use_without_log_assistance(mp)) + xlog_drop_incompat_feat(mp->m_log, XLOG_INCOMPAT_FEAT_SWAPEXT); +} + +/* Exchange the contents of two files. */ +int +xfs_xchg_range( + struct xfs_inode *ip1, + struct xfs_inode *ip2, + const struct xfs_exch_range *fxr, + unsigned int xchg_flags) +{ + struct xfs_mount *mp = ip1->i_mount; + struct xfs_swapext_req req = { + .ip1 = ip1, + .ip2 = ip2, + .whichfork = XFS_DATA_FORK, + .startoff1 = XFS_B_TO_FSBT(mp, fxr->file1_offset), + .startoff2 = XFS_B_TO_FSBT(mp, fxr->file2_offset), + .blockcount = XFS_B_TO_FSB(mp, fxr->length), + }; + struct xfs_trans *tp; + unsigned int qretry; + bool retried = false; + int error; + + trace_xfs_xchg_range(ip1, fxr, ip2, xchg_flags); + + /* + * This function only supports using log intent items (SXI items if + * atomic exchange is required, or BUI items if not) to exchange file + * data. The legacy whole-fork swap will be ported in a later patch. + */ + if (!(xchg_flags & XFS_XCHG_RANGE_LOGGED) && + !xfs_swapext_supports_nonatomic(mp)) + return -EOPNOTSUPP; + + if (fxr->flags & XFS_EXCH_RANGE_TO_EOF) + req.req_flags |= XFS_SWAP_REQ_SET_SIZES; + if (fxr->flags & XFS_EXCH_RANGE_FILE1_WRITTEN) + req.req_flags |= XFS_SWAP_REQ_INO1_WRITTEN; + if (xchg_flags & XFS_XCHG_RANGE_LOGGED) + req.req_flags |= XFS_SWAP_REQ_LOGGED; + + error = xfs_xchg_range_estimate(&req); + if (error) + return error; + +retry: + /* Allocate the transaction, lock the inodes, and join them. */ + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, req.resblks, 0, + XFS_TRANS_RES_FDBLKS, &tp); + if (error) + return error; + + xfs_xchg_range_ilock(tp, ip1, ip2); + + trace_xfs_swap_extent_before(ip2, 0); + trace_xfs_swap_extent_before(ip1, 1); + + if (fxr->flags & XFS_EXCH_RANGE_FILE2_FRESH) + trace_xfs_xchg_range_freshness(ip2, fxr); + + /* + * Now that we've excluded all other inode metadata changes by taking + * the ILOCK, repeat the freshness check. + */ + error = xfs_exch_range_check_fresh(VFS_I(ip2), fxr); + if (error) + goto out_trans_cancel; + + error = xfs_swapext_check_extents(mp, &req); + if (error) + goto out_trans_cancel; + + /* + * Reserve ourselves some quota if any of them are in enforcing mode. + * In theory we only need enough to satisfy the change in the number + * of blocks between the two ranges being remapped. + */ + error = xfs_xchg_range_reserve_quota(tp, &req, &qretry); + if ((error == -EDQUOT || error == -ENOSPC) && !retried) { + xfs_trans_cancel(tp); + xfs_xchg_range_iunlock(ip1, ip2); + if (qretry & QRETRY_IP1) + xfs_blockgc_free_quota(ip1, 0); + if (qretry & QRETRY_IP2) + xfs_blockgc_free_quota(ip2, 0); + retried = true; + goto retry; + } + if (error) + goto out_trans_cancel; + + /* If we got this far on a dry run, all parameters are ok. */ + if (fxr->flags & XFS_EXCH_RANGE_DRY_RUN) + goto out_trans_cancel; + + /* Update the mtime and ctime of both files. */ + if (xchg_flags & XFS_XCHG_RANGE_UPD_CMTIME1) + xfs_trans_ichgtime(tp, ip1, + XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + if (xchg_flags & XFS_XCHG_RANGE_UPD_CMTIME2) + xfs_trans_ichgtime(tp, ip2, + XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + + xfs_swapext(tp, &req); + + /* + * Force the log to persist metadata updates if the caller or the + * administrator requires this. The VFS prep function already flushed + * the relevant parts of the page cache. + */ + if (xfs_has_wsync(mp) || (fxr->flags & XFS_EXCH_RANGE_FSYNC)) + xfs_trans_set_sync(tp); + + error = xfs_trans_commit(tp); + + trace_xfs_swap_extent_after(ip2, 0); + trace_xfs_swap_extent_after(ip1, 1); + + if (error) + goto out_unlock; + + /* + * If the caller wanted us to exchange the contents of two complete + * files of unequal length, exchange the incore sizes now. This should + * be safe because we flushed both files' page caches, moved all the + * extents, and updated the ondisk sizes. + */ + if (fxr->flags & XFS_EXCH_RANGE_TO_EOF) { + loff_t temp; + + temp = i_size_read(VFS_I(ip2)); + i_size_write(VFS_I(ip2), i_size_read(VFS_I(ip1))); + i_size_write(VFS_I(ip1), temp); + } + +out_unlock: + xfs_xchg_range_iunlock(ip1, ip2); + return error; + +out_trans_cancel: + xfs_trans_cancel(tp); + goto out_unlock; +} diff --git a/fs/xfs/xfs_xchgrange.h b/fs/xfs/xfs_xchgrange.h index 3544ed84e4106..3471182d1402f 100644 --- a/fs/xfs/xfs_xchgrange.h +++ b/fs/xfs/xfs_xchgrange.h @@ -15,6 +15,11 @@ int xfs_exch_range_finish(struct file *file1, struct file *file2); int xfs_exch_range(struct file *file1, struct file *file2, struct xfs_exch_range *fxr); +/* Binding between the generic VFS and the XFS-specific file exchange */ + +int xfs_file_xchg_range(struct file *file1, struct file *file2, + struct xfs_exch_range *fxr); + /* XFS-specific parts of file exchanges */ struct xfs_swapext_req; @@ -25,4 +30,27 @@ void xfs_xchg_range_iunlock(struct xfs_inode *ip1, struct xfs_inode *ip2); int xfs_xchg_range_estimate(struct xfs_swapext_req *req); +int xfs_xchg_range_grab_log_assist(struct xfs_mount *mp, bool force, + bool *use_logging); +void xfs_xchg_range_rele_log_assist(struct xfs_mount *mp); + +/* Caller has permission to use log intent items for the exchange operation. */ +#define XFS_XCHG_RANGE_LOGGED (1U << 0) + +/* Update ip1's change and mod time. */ +#define XFS_XCHG_RANGE_UPD_CMTIME1 (1U << 1) + +/* Update ip2's change and mod time. */ +#define XFS_XCHG_RANGE_UPD_CMTIME2 (1U << 2) + +#define XCHG_RANGE_FLAGS_STRS \ + { XFS_XCHG_RANGE_LOGGED, "LOGGED" }, \ + { XFS_XCHG_RANGE_UPD_CMTIME1, "UPD_CMTIME1" }, \ + { XFS_XCHG_RANGE_UPD_CMTIME2, "UPD_CMTIME2" } + +int xfs_xchg_range(struct xfs_inode *ip1, struct xfs_inode *ip2, + const struct xfs_exch_range *fxr, unsigned int xchg_flags); +int xfs_xchg_range_prep(struct file *file1, struct file *file2, + struct xfs_exch_range *fxr); + #endif /* __XFS_XCHGRANGE_H__ */