Add to XFS the ability to share arbitrary blocks between one file and another (reflink). The userspace ioctl uses the same interface as the btrfs ioctl. Signed-off-by: Darrick J. Wong <darrick.wong@xxxxxxxxxx> --- fs/xfs/Makefile | 1 fs/xfs/libxfs/xfs_fs.h | 10 ++ fs/xfs/xfs_ioctl.c | 178 +++++++++++++++++++++++++++++ fs/xfs/xfs_ioctl32.c | 2 fs/xfs/xfs_reflink.c | 296 ++++++++++++++++++++++++++++++++++++++++++++++++ fs/xfs/xfs_reflink.h | 24 ++++ 6 files changed, 511 insertions(+) create mode 100644 fs/xfs/xfs_reflink.c create mode 100644 fs/xfs/xfs_reflink.h diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index ba89aee..eb9dc8e 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -87,6 +87,7 @@ xfs-y += xfs_aops.o \ xfs_message.o \ xfs_mount.o \ xfs_mru_cache.o \ + xfs_reflink.o \ xfs_super.o \ xfs_symlink.o \ xfs_sysfs.o \ diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index 9fbdb86..92f21e1 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -560,6 +560,16 @@ typedef struct xfs_swapext #define XFS_IOC_GOINGDOWN _IOR ('X', 125, __uint32_t) /* XFS_IOC_GETFSUUID ---------- deprecated 140 */ +/* reflink ioctls; these should match btrfs */ +struct xfs_ioctl_clone_range_args { + __s64 src_fd; /* descriptor of the file to clone from */ + __u64 src_offset; /* byte offset in the source file */ + __u64 src_length; /* bytes to clone; 0 means up to the end of the source */ + __u64 dest_offset; /* byte offset in the file the ioctl is issued on */ +}; + +#define XFS_IOC_CLONE _IOW (0x94, 9, int) +#define XFS_IOC_CLONE_RANGE _IOW (0x94, 13, struct xfs_ioctl_clone_range_args) #ifndef HAVE_BBMACROS /* diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index ea7d85a..efc6e8d 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -40,6 +40,7 @@ #include "xfs_symlink.h" #include "xfs_trans.h" #include "xfs_pnfs.h" +#include "xfs_reflink.h" #include <linux/capability.h> #include <linux/dcache.h> @@ -48,6 +49,8 @@ #include <linux/pagemap.h> #include <linux/slab.h> #include <linux/exportfs.h> +#include <linux/fsnotify.h> +#include <linux/security.h> /* * xfs_find_handle maps from userspace xfs_fsop_handlereq structure to @@ -1502,6 
+1505,145 @@ xfs_ioc_swapext(
 	return error;
 }
 
+/*
+ * Flush and wait for all I/O covering a byte range of @inode.
+ *
+ * Waits for in-flight direct I/O on the inode, then writes back and waits on
+ * the page cache for [offset, offset + len).  The range is widened outwards
+ * to a multiple of the larger of the filesystem block size and the page
+ * cache page size.
+ *
+ * Returns the result of filemap_write_and_wait_range().
+ */
+static int
+wait_for_io(
+	struct inode	*inode,
+	loff_t		offset,
+	loff_t		len)
+{
+	loff_t		rounding;
+	loff_t		ioffset;
+	loff_t		iendoffset;
+
+	inode_dio_wait(inode);
+
+	rounding = max_t(xfs_off_t, inode->i_sb->s_blocksize, PAGE_CACHE_SIZE);
+	ioffset = round_down(offset, rounding);
+	iendoffset = round_up(offset + len, rounding) - 1;
+	return filemap_write_and_wait_range(inode->i_mapping, ioffset,
+			iendoffset);
+}
+
+/*
+ * Validate a clone request and share blocks between two open files.
+ *
+ * @file_in:	file to clone from; must be open for reading
+ * @pos_in:	byte offset in @file_in where the cloned range starts
+ * @file_out:	file to clone into; must be open for writing, not O_APPEND
+ * @pos_out:	byte offset in @file_out where the range is placed
+ * @len:	number of bytes to clone; ~0ULL means "from @pos_in to the end
+ *		of @file_in"
+ *
+ * The length is carried as a 64-bit value so that the ~0ULL sentinel and
+ * lengths of 4GiB or more are not truncated where size_t is 32 bits wide.
+ *
+ * Both files must be regular files on the same superblock and mount, and
+ * neither may be a swap file.  Offsets must be non-negative and block
+ * aligned; the end of the range must be block aligned unless it coincides
+ * with the end of the source file, in which case it is extended to the next
+ * block boundary for the alignment and overlap checks.  A range that
+ * overlaps itself within one file is rejected.
+ *
+ * On success the stale page cache of the destination range is dropped and
+ * fsnotify / per-task I/O accounting are updated.
+ *
+ * Returns 0 on success (including a zero @len or an empty source file) or a
+ * negative errno; any value returned by xfs_reflink() is passed through.
+ */
+static int
+xfs_ioctl_reflink(
+	struct file	*file_in,
+	loff_t		pos_in,
+	struct file	*file_out,
+	loff_t		pos_out,
+	u64		len)
+{
+	struct inode	*inode_in;
+	struct inode	*inode_out;
+	int		ret;
+	loff_t		bs;
+	loff_t		isize;
+	int		same_inode;
+	loff_t		blen;
+
+	if (len == 0)
+		return 0;
+	else if (len != ~0ULL && (s64)len < 0)
+		return -EINVAL;
+
+	/* Do we have the correct permissions? */
+	if (!(file_in->f_mode & FMODE_READ) ||
+	    !(file_out->f_mode & FMODE_WRITE) ||
+	    (file_out->f_flags & O_APPEND))
+		return -EPERM;
+	ret = security_file_permission(file_out, MAY_WRITE);
+	if (ret)
+		return ret;
+
+	inode_in = file_inode(file_in);
+	inode_out = file_inode(file_out);
+	bs = inode_out->i_sb->s_blocksize;
+
+	/* Don't touch certain kinds of inodes */
+	if (IS_IMMUTABLE(inode_out))
+		return -EPERM;
+	if (IS_SWAPFILE(inode_in) ||
+	    IS_SWAPFILE(inode_out))
+		return -ETXTBSY;
+
+	/* Reflink only works within this filesystem. */
+	if (inode_in->i_sb != inode_out->i_sb ||
+	    file_in->f_path.mnt != file_out->f_path.mnt)
+		return -EXDEV;
+	same_inode = (inode_in->i_ino == inode_out->i_ino);
+
+	/* Don't reflink dirs, pipes, sockets... */
+	if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
+		return -EISDIR;
+	if (S_ISFIFO(inode_in->i_mode) || S_ISFIFO(inode_out->i_mode))
+		return -ESPIPE;
+	if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
+		return -EINVAL;
+
+	/* Nothing to share from an empty file. */
+	isize = i_size_read(inode_in);
+	if (isize == 0)
+		return 0;
+
+	/*
+	 * The ioctl hands us unsigned 64-bit offsets; refuse the ones that
+	 * come out negative, and a source offset beyond EOF, before doing
+	 * any arithmetic with them.
+	 */
+	if (pos_in < 0 || pos_out < 0 || pos_in > isize)
+		return -EINVAL;
+
+	/* Are we going all the way to the end? */
+	if (len == ~0ULL)
+		len = isize - pos_in;
+
+	/* The source range must lie inside i_size. */
+	if (len > (u64)(isize - pos_in))
+		return -EINVAL;
+
+	/* If we're linking to EOF, continue to the block boundary. */
+	if (pos_in + len == isize)
+		blen = ALIGN(isize, bs) - pos_in;
+	else
+		blen = len;
+
+	/* The destination range must not wrap past the largest offset. */
+	if (blen > LLONG_MAX - pos_out)
+		return -EINVAL;
+
+	/* Only reflink if we're aligned to block boundaries */
+	if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_in + blen, bs) ||
+	    !IS_ALIGNED(pos_out, bs) || !IS_ALIGNED(pos_out + blen, bs))
+		return -EINVAL;
+
+	/* Don't allow overlapped reflink within the same file */
+	if (same_inode && pos_out + blen > pos_in && pos_out < pos_in + blen)
+		return -EINVAL;
+
+	ret = mnt_want_write_file(file_out);
+	if (ret)
+		return ret;
+
+	/* Wait for the completion of any pending IOs on both files */
+	ret = wait_for_io(inode_in, pos_in, len);
+	if (ret)
+		goto out_drop_write;
+	ret = wait_for_io(inode_out, pos_out, len);
+	if (ret)
+		goto out_drop_write;
+
+	ret = xfs_reflink(XFS_I(inode_in), pos_in, XFS_I(inode_out), pos_out, len);
+	if (ret < 0)
+		goto out_drop_write;
+
+	/* Truncate the page cache so we don't see stale data */
+	truncate_inode_pages_range(&inode_out->i_data, pos_out,
+			PAGE_CACHE_ALIGN(pos_out + len) - 1);
+
+out_drop_write:
+	/* Byte counters and notifications only on success... */
+	if (ret == 0) {
+		fsnotify_access(file_in);
+		add_rchar(current, len);
+		fsnotify_modify(file_out);
+		add_wchar(current, len);
+	}
+	/* ...but the attempt itself always counts as a read and a write call. */
+	inc_syscr(current);
+	inc_syscw(current);
+
+	mnt_drop_write_file(file_out);
+	return ret;
+}
+
 /*
  * Note: some of the ioctl's return positive numbers as a
  * byte count indicating success, such as readlink_by_handle.
@@ -1800,6 +1942,42 @@ xfs_file_ioctl( return xfs_icache_free_eofblocks(mp, &keofb); } + case XFS_IOC_CLONE: { + struct fd src; + + src = fdget(p); + if (!src.file) + return -EBADF; + + error = xfs_ioctl_reflink(src.file, 0, filp, 0, ~0ULL); + fdput(src); + if (error > 0) + error = 0; + + return error; + } + + case XFS_IOC_CLONE_RANGE: { + struct fd src; + struct xfs_ioctl_clone_range_args args; + + if (copy_from_user(&args, arg, sizeof(args))) + return -EFAULT; + src = fdget(args.src_fd); + if (!src.file) + return -EBADF; + if (args.src_length == 0) + args.src_length = ~0ULL; + + error = xfs_ioctl_reflink(src.file, args.src_offset, filp, + args.dest_offset, args.src_length); + fdput(src); + if (error > 0) + error = 0; + + return error; + } + default: return -ENOTTY; } diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index b88bdc8..76d8729 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -558,6 +558,8 @@ xfs_file_compat_ioctl( case XFS_IOC_GOINGDOWN: case XFS_IOC_ERROR_INJECTION: case XFS_IOC_ERROR_CLEARALL: + case XFS_IOC_CLONE: + case XFS_IOC_CLONE_RANGE: return xfs_file_ioctl(filp, cmd, p); #ifndef BROKEN_X86_ALIGNMENT /* These are handled fine if no alignment issues */ diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c new file mode 100644 index 0000000..ce5feeb --- /dev/null +++ b/fs/xfs/xfs_reflink.c @@ -0,0 +1,296 @@ +/* + * Copyright (c) 2015 Oracle. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_inode_item.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
+#include "xfs_error.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_ioctl.h"
+#include "xfs_trace.h"
+#include "xfs_log.h"
+#include "xfs_icache.h"
+#include "xfs_pnfs.h"
+#include "xfs_reflink_btree.h"
+#include "xfs_reflink.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_trans_space.h"
+#include "xfs_bit.h"
+#include "xfs_alloc.h"
+#include "xfs_quota_defs.h"
+#include "xfs_quota.h"
+
+/**
+ * xfs_reflink() - link a range of blocks from one inode to another
+ *
+ * @src: Inode to clone from
+ * @srcoff: Offset within @src to start clone from
+ * @dest: Inode to clone to
+ * @destoff: Offset within @dest to start clone
+ * @len: Original length, passed by user, of range to clone
+ *
+ * The work is done in three phases, each in its own transaction(s), with
+ * the IOLOCK and MMAPLOCK of both inodes held exclusively throughout:
+ *
+ *  1. unmap whatever @dest currently maps in the target range (@len rounded
+ *     up to the filesystem block size);
+ *  2. walk the extents of @src; for every real extent (holes and delayed
+ *     allocations are skipped) adjust the reference count btree and map the
+ *     same blocks into @dest with XFS_BMAPI_REFLINK;
+ *  3. if the cloned range ends beyond the current size of @dest, grow the
+ *     in-core and on-disk size to @destoff + @len.
+ *
+ * Return: 0 on success; -EOPNOTSUPP if the filesystem lacks the reflink
+ * feature, -EIO if it has been shut down, -EINVAL for realtime inodes, or
+ * the error returned by a failing transaction step.
+ */
+int						/* error */
+xfs_reflink(
+	struct xfs_inode	*src,		/* XFS inode to copy extents from */
+	xfs_off_t		srcoff,		/* offset in source file */
+	struct xfs_inode	*dest,		/* XFS inode to copy extents to */
+	xfs_off_t		destoff,	/* offset in destination file */
+	xfs_off_t		len)		/* number of bytes to copy */
+{
+	struct xfs_mount	*mp = src->i_mount;
+	xfs_fileoff_t		fsbno, dfsbno, fsbnext;
+	xfs_filblks_t		end;
+	int			error;
+	xfs_bmbt_irec_t		imap;
+	int			nimaps = 1;
+	uint			resblks;
+	xfs_bmap_free_t		free_list;
+	xfs_bmbt_irec_t		map, dmap;
+	xfs_trans_t		*tp;
+	int			committed;
+	xfs_fsblock_t		firstfsb;
+	struct xfs_buf		*agbp;
+	xfs_agnumber_t		agno;		/* allocation group number */
+	xfs_agblock_t		agbno;
+	int			done;
+	xfs_off_t		blen = ALIGN(len, mp->m_sb.sb_blocksize);
+
+	if (!xfs_sb_version_hasreflink(&mp->m_sb))
+		return -EOPNOTSUPP;
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return -EIO;
+
+	/* For now, we won't reflink realtime inodes */
+	if (XFS_IS_REALTIME_INODE(src) || XFS_IS_REALTIME_INODE(dest))
+		return -EINVAL;
+
+	/* Lock both files against IO */
+	if (src->i_ino == dest->i_ino) {
+		xfs_ilock(src, XFS_IOLOCK_EXCL);
+		xfs_ilock(src, XFS_MMAPLOCK_EXCL);
+	} else {
+		xfs_lock_two_inodes(src, dest, XFS_IOLOCK_EXCL);
+		xfs_lock_two_inodes(src, dest, XFS_MMAPLOCK_EXCL);
+	}
+
+	/*
+	 * Convert the byte range into filesystem blocks: the first source
+	 * and destination block, and the block just past the source range.
+	 */
+	dfsbno = XFS_B_TO_FSBT(mp, destoff);
+	fsbno = fsbnext = XFS_B_TO_FSBT(mp, srcoff);
+	end = XFS_B_TO_FSB(mp, srcoff + blen);
+
+	/*
+	 * Phase 1: unmap the destination range.  xfs_bunmapi() sets 'done'
+	 * once the whole range is gone; until then, go around again with a
+	 * fresh transaction.
+	 */
+	resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
+	error = done = 0;
+	while (!error && !done) {
+		tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
+		error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, resblks, 0);
+		if (error) {
+			/* Nothing joined yet; just free the transaction. */
+			ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp));
+			goto error0;
+		}
+		error = xfs_trans_reserve_quota(tp, mp,
+				dest->i_udquot, dest->i_gdquot, dest->i_pdquot,
+				resblks, 0, XFS_QMOPT_RES_REGBLKS);
+		if (error)
+			goto error0;
+
+		/* The ILOCK is handed to the transaction by the ijoin. */
+		xfs_ilock(dest, XFS_ILOCK_EXCL);
+		xfs_trans_ijoin(tp, dest, XFS_ILOCK_EXCL);
+
+		/* Issue the bunmapi() call to free the blocks */
+		xfs_bmap_init(&free_list, &firstfsb);
+		error = xfs_bunmapi(tp, dest, dfsbno,
+				XFS_B_TO_FSBT(mp, destoff + blen) - dfsbno,
+				0, 2, &firstfsb, &free_list, &done);
+		if (error)
+			goto error1;
+
+		/*
+		 * Complete the transaction.  The free list still has to be
+		 * cancelled if finishing it fails, hence error1 and not
+		 * error0.
+		 */
+		error = xfs_bmap_finish(&tp, &free_list, &committed);
+		if (error)
+			goto error1;
+
+		error = xfs_trans_commit(tp);
+	}
+	if (error)
+		goto out_unlock_io;
+
+	/* Phase 2: share the source extents, one mapping per transaction. */
+	while (fsbnext < end) {
+		/* Read extent from the source file */
+		nimaps = 1;
+		xfs_ilock(src, XFS_ILOCK_EXCL);
+		error = xfs_bmapi_read(src, fsbnext, end - fsbnext, &map,
+				&nimaps, 0);
+		xfs_iunlock(src, XFS_ILOCK_EXCL);
+		if (error)
+			goto out_unlock_io;
+
+		/* No extents at given offset, must be beyond EOF */
+		if (nimaps == 0)
+			break;
+
+		/* Holes and delayed allocations have no blocks to share. */
+		if (map.br_startblock == HOLESTARTBLOCK ||
+		    map.br_startblock == DELAYSTARTBLOCK)
+			goto next;
+
+		/* Same blocks, but at the matching destination file offset */
+		dmap = map;
+		dmap.br_startoff = dfsbno + dmap.br_startoff - fsbno;
+		nimaps = 1;
+
+		/* Allocate and setup the transaction. */
+		resblks = XFS_DIOSTRAT_SPACE_RES(mp, dmap.br_blockcount * 2);
+		tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
+		error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
+				resblks, 0);
+		if (error) {
+			/* Nothing joined yet; just free the transaction. */
+			ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp));
+			goto error0;
+		}
+
+		xfs_ilock(dest, XFS_ILOCK_EXCL);
+		xfs_trans_ijoin(tp, dest, XFS_ILOCK_EXCL);
+
+		xfs_bmap_init(&free_list, &firstfsb);
+
+		/*
+		 * Update the refcount tree: adjust the count of the extent's
+		 * blocks by 1 in the AG that holds them.
+		 */
+		agno = XFS_FSB_TO_AGNO(mp, dmap.br_startblock);
+		agbno = XFS_FSB_TO_AGBNO(mp, dmap.br_startblock);
+		error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp);
+		if (error)
+			goto error1;
+		error = xfs_reflinkbt_adjust_refcount(mp, tp, agbp, agno, agbno,
+				dmap.br_blockcount, 1);
+		if (error)
+			goto error1;
+		xfs_trans_brelse(tp, agbp);
+
+		/* XXX: should this be a separate transaction? */
+
+		/* Add this extent to the destination file */
+		error = xfs_bmapi_write(tp, dest, dmap.br_startoff,
+				dmap.br_blockcount,
+				XFS_BMAPI_REFLINK, &dmap.br_startblock,
+				0, &imap, &nimaps, &free_list);
+		if (error)
+			goto error1;
+
+		/* Complete the transaction; see phase 1 for the error path. */
+		error = xfs_bmap_finish(&tp, &free_list, &committed);
+		if (error)
+			goto error1;
+
+		error = xfs_trans_commit(tp);
+		if (error)
+			goto out_unlock_io;
+
+		/* Keep going */
+next:
+		fsbnext = map.br_startoff + map.br_blockcount;
+	}
+
+	/* Phase 3: update inode size if the clone extended the file */
+	if (destoff + len > i_size_read(VFS_I(dest))) {
+		tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE);
+		error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
+		if (error) {
+			/* Nothing joined yet; just free the transaction. */
+			ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp));
+			goto error0;
+		}
+
+		xfs_ilock(dest, XFS_ILOCK_EXCL);
+		xfs_trans_ijoin(tp, dest, XFS_ILOCK_EXCL);
+
+		i_size_write(VFS_I(dest), destoff + len);
+		dest->i_d.di_size = destoff + len;
+		xfs_trans_log_inode(tp, dest, XFS_ILOG_CORE);
+
+		error = xfs_trans_commit(tp);
+		if (error)
+			goto out_unlock_io;
+	}
+
+	goto out_unlock_io;
+
+error1:
+	/* Cancel the pending extent frees, then cancel the transaction */
+	xfs_bmap_cancel(&free_list);
+error0:
+	/* Just cancel the transaction */
+	xfs_trans_cancel(tp);
+
+out_unlock_io:
+	xfs_iunlock(src, XFS_MMAPLOCK_EXCL);
+	xfs_iunlock(src, XFS_IOLOCK_EXCL);
+	if (src->i_ino != dest->i_ino) {
+		xfs_iunlock(dest, XFS_MMAPLOCK_EXCL);
+		xfs_iunlock(dest, XFS_IOLOCK_EXCL);
+	}
+
+	return error;
+}
diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
new file mode 100644
index 0000000..7cccd50
--- /dev/null
+++ b/fs/xfs/xfs_reflink.h
@@ -0,0 +1,24 @@
+/*
+ * Copyright (c) 2015 Oracle.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_REFLINK_H
+#define __XFS_REFLINK_H 1
+
+extern int xfs_reflink(struct xfs_inode *src, xfs_off_t srcoff,
+		struct xfs_inode *dest, xfs_off_t destoff, xfs_off_t len);
+
+#endif /* __XFS_REFLINK_H */
_______________________________________________
xfs mailing list
xfs@xxxxxxxxxxx
http://oss.sgi.com/mailman/listinfo/xfs