[PATCH v3 2/7] xfs: add support FALLOC_FL_COLLAPSE_RANGE for fallocate

Namjae Jeon <linkinjeon@xxxxxxxxx> · Sun, 8 Sep 2013 22:41:50 +0900

From: Namjae Jeon <namjae.jeon@xxxxxxxxxxx>

Add support for FALLOC_FL_COLLAPSE_RANGE to fallocate.

Signed-off-by: Namjae Jeon <namjae.jeon@xxxxxxxxxxx>
Signed-off-by: Ashish Sangwan <a.sangwan@xxxxxxxxxxx>
---
 fs/xfs/xfs_bmap.c      |  174 ++++++++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/xfs_bmap.h      |    3 +
 fs/xfs/xfs_bmap_util.c |   96 ++++++++++++++++++++++++++
 fs/xfs/xfs_bmap_util.h |    2 +
 fs/xfs/xfs_file.c      |   20 ++++--
 fs/xfs/xfs_fs.h        |    6 ++
 6 files changed, 296 insertions(+), 5 deletions(-)

diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 92b8309..c12358e 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -5356,3 +5356,177 @@ error0:
 	}
 	return error;
 }
+
+/*
+ * Shift data fork extents downwards into a hole by 'shift' blocks, merging
+ * an extent into its left neighbour when the two become contiguous.
+ * At most 'count' extents are shifted per call; *current_ext is the extent
+ * currently being shifted and *done is set once all extents are handled.
+ * Returns EFSCORRUPTED if the hole is too small, otherwise 0 or an error.
+ */
+int
+xfs_bmap_shift_extents(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip,
+	int			*done,
+	xfs_fileoff_t		start_fsb,
+	xfs_fileoff_t		shift,
+	xfs_extnum_t		*current_ext,
+	xfs_fsblock_t		*firstblock,
+	struct xfs_bmap_free	*flist,
+	int			count)
+{
+	struct xfs_btree_cur		*cur;
+	struct xfs_bmbt_rec_host	*gotp;
+	struct xfs_bmbt_irec		left;
+	struct xfs_mount		*mp = ip->i_mount;
+	struct xfs_ifork		*ifp;
+	xfs_extnum_t			nexts = 0;
+	xfs_fileoff_t			startoff;
+	int				error = 0;
+	int				i;
+	int				whichfork = XFS_DATA_FORK;
+	int				state;
+	int				logflags;
+	xfs_filblks_t			blockcount = 0;
+
+	if (unlikely(XFS_TEST_ERROR(
+	    (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
+	     XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
+	     mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
+		XFS_ERROR_REPORT("xfs_bmap_shift_extents",
+				 XFS_ERRLEVEL_LOW, mp);
+		return XFS_ERROR(EFSCORRUPTED);
+	}
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return XFS_ERROR(EIO);
+
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+
+	if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+		/* Read in all the extents */
+		error = xfs_iread_extents(tp, ip, whichfork);
+		if (error)
+			return error;
+	}
+
+	/* *current_ext == 0: look up the starting extent from start_fsb */
+	if (!*current_ext) {
+		gotp = xfs_iext_bno_to_ext(ifp, start_fsb, current_ext);
+		/*
+		 * gotp is NULL if there are no extents, or if start_fsb lies
+		 * in a hole with no extents beyond it. Either way, we are done.
+		 */
+		if (!gotp) {
+			*done = 1;
+			return 0;
+		}
+	}
+
+	/* We are going to change core inode */
+	logflags = XFS_ILOG_CORE;
+
+	if (ifp->if_flags & XFS_IFBROOT) {
+		cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
+		cur->bc_private.b.firstblock = *firstblock;
+		cur->bc_private.b.flist = flist;
+		cur->bc_private.b.flags = 0;
+		}
+	else {
+		cur = NULL;
+		logflags |= XFS_ILOG_DEXT;
+	}
+
+	while (nexts++ < count &&
+	       *current_ext <  XFS_IFORK_NEXTENTS(ip, whichfork)) {
+		state = 0;
+
+		gotp = xfs_iext_get_ext(ifp, *current_ext);
+		startoff = xfs_bmbt_get_startoff(gotp);
+		startoff -= shift;
+
+		/* The hole to the left must be large enough for the shift */
+		if (*current_ext) {
+			state |= BMAP_LEFT_VALID;
+			xfs_bmbt_get_all(xfs_iext_get_ext(ifp,
+						*current_ext - 1), &left);
+
+			if (isnullstartblock(left.br_startblock))
+				state |= BMAP_LEFT_DELAY;
+
+			if (startoff < left.br_startoff + left.br_blockcount)
+				error = XFS_ERROR(EFSCORRUPTED);
+
+		} else if (startoff > xfs_bmbt_get_startoff(gotp))
+			/* Hole is at the start but not large enough */
+			error = XFS_ERROR(EFSCORRUPTED);
+
+		if (error)
+			goto del_cursor;
+
+		/*
+		 * Merge into left if now adjacent. NOTE(review): the removed
+		 * extent's bmbt record is not deleted here when cur is set.
+		 */
+		if ((state & BMAP_LEFT_VALID) && !(state & BMAP_LEFT_DELAY) &&
+		    left.br_startoff + left.br_blockcount == startoff &&
+		    left.br_startblock + left.br_blockcount ==
+		    xfs_bmbt_get_startblock(gotp) &&
+		    xfs_bmbt_get_state(gotp) == left.br_state &&
+		    left.br_blockcount + xfs_bmbt_get_blockcount(gotp) <=
+		    MAXEXTLEN) {
+			blockcount =
+			left.br_blockcount + xfs_bmbt_get_blockcount(gotp);
+			state |= BMAP_LEFT_CONTIG;
+			xfs_iext_remove(ip, *current_ext, 1, 0);
+			XFS_IFORK_NEXT_SET(ip, whichfork,
+				XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
+			gotp = xfs_iext_get_ext(ifp, --*current_ext);
+		}
+
+		if (cur) {
+			error = xfs_bmbt_lookup_eq(cur,
+					xfs_bmbt_get_startoff(gotp),
+					xfs_bmbt_get_startblock(gotp),
+					xfs_bmbt_get_blockcount(gotp),
+					&i);
+			if (error)
+				goto del_cursor;
+			XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor);
+		}
+
+		if (state & BMAP_LEFT_CONTIG) {
+			/* Merged: grow the left extent to cover both */
+			xfs_bmbt_set_blockcount(gotp, blockcount);
+		} else {
+			/* Not merged: move the extent to its new offset */
+			xfs_bmbt_set_startoff(gotp, startoff);
+		}
+
+		if (cur) {
+			error = xfs_bmbt_update(cur,
+						xfs_bmbt_get_startoff(gotp),
+						xfs_bmbt_get_startblock(gotp),
+						xfs_bmbt_get_blockcount(gotp),
+						xfs_bmbt_get_state(gotp));
+			if (error)
+				goto del_cursor;
+		}
+
+		(*current_ext)++;
+	}
+
+	/* Check if we are done */
+	if (*current_ext ==  XFS_IFORK_NEXTENTS(ip, whichfork))
+		*done = 1;
+
+del_cursor:
+	if (cur)
+		xfs_btree_del_cursor(cur,
+			error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+
+	xfs_trans_log_inode(tp, ip, logflags);
+
+	return error;
+}
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index 33b41f3..b16ebfa 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -169,5 +169,8 @@ int	xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip,
 int	xfs_check_nostate_extents(struct xfs_ifork *ifp, xfs_extnum_t idx,
 		xfs_extnum_t num);
 uint	xfs_default_attroffset(struct xfs_inode *ip);
+int	xfs_bmap_shift_extents(struct xfs_trans *, struct xfs_inode *,
+			int *, xfs_fileoff_t, xfs_fileoff_t, xfs_extnum_t *,
+			xfs_fsblock_t *, struct xfs_bmap_free *, int);
 
 #endif	/* __XFS_BMAP_H__ */
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 541d59f..57f045e 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -1556,6 +1556,7 @@ xfs_change_file_space(
 	case XFS_IOC_RESVSP64:
 	case XFS_IOC_UNRESVSP:
 	case XFS_IOC_UNRESVSP64:
+	case XFS_COLLAPSE_RANGE:
 		if (bf->l_len <= 0)
 			return XFS_ERROR(EINVAL);
 		break;
@@ -1638,6 +1639,12 @@ xfs_change_file_space(
 
 		clrprealloc = 1;
 		break;
+	case XFS_COLLAPSE_RANGE:
+		error = xfs_collapse_file_space(ip, startoffset, bf->l_len,
+						attr_flags);
+		if (error)
+			return error;
+		break;
 
 	default:
 		ASSERT(0);
@@ -1683,6 +1690,95 @@ xfs_change_file_space(
 	return xfs_trans_commit(tp, 0);
 }
 
+
+/*
+ * xfs_collapse_file_space: Implements the FALLOC_FL_COLLAPSE_RANGE flag.
+ *
+ * Frees the blocks in [offset, offset + len) and then shifts the extents
+ * beyond that range down by len, at most two extents per transaction.
+ * The ILOCK is taken and released around each transaction, so every exit
+ * after xfs_ilock() must drop XFS_ILOCK_EXCL. Returns 0 or an error.
+ */
+int
+xfs_collapse_file_space(
+	struct xfs_inode	*ip,
+	loff_t			offset,
+	loff_t			len,
+	int			attr_flags)
+{
+	int			done = 0;
+	struct xfs_mount	*mp = ip->i_mount;
+	uint			resblks;
+	struct xfs_trans	*tp;
+	int			error;
+	xfs_extnum_t		current_ext = 0;
+	struct xfs_bmap_free	free_list;
+	xfs_fsblock_t		first_block;
+	int			committed;
+	xfs_fileoff_t		start_fsb = XFS_B_TO_FSB(mp, offset + len);
+	xfs_fileoff_t		shift_fsb = XFS_B_TO_FSB(mp, len);
+
+	resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
+
+	/*
+	 * Free the data blocks in the range first. xfs_free_file_space() is
+	 * also expected to sync dirty data and invalidate the page cache.
+	 */
+	error = xfs_free_file_space(ip, offset, len, attr_flags);
+	if (error)
+		return error;
+
+	while (!error && !done) {
+		tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
+		tp->t_flags |= XFS_TRANS_RESERVE;
+		/*
+		 * Reserve blocks: merging adjacent extents after a shift may
+		 * lead to freeing of a block during the record update.
+		 */
+		error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, resblks, 0);
+		if (error) {
+			ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
+			xfs_trans_cancel(tp, 0);
+			break;
+		}
+
+		xfs_ilock(ip, XFS_ILOCK_EXCL);
+		error = xfs_trans_reserve_quota(tp, mp,
+				ip->i_udquot, ip->i_gdquot, ip->i_pdquot,
+				resblks, 0, XFS_QMOPT_RES_REGBLKS);
+		if (error)
+			goto out;
+
+		xfs_trans_ijoin(tp, ip, 0);
+
+		xfs_bmap_init(&free_list, &first_block);
+
+		/*
+		 * We are using the write transaction in which max 2 bmbt
+		 * updates are allowed
+		 */
+		error = xfs_bmap_shift_extents(tp, ip, &done, start_fsb,
+				shift_fsb, &current_ext,
+				&first_block, &free_list, 2);
+		if (error)
+			goto out;
+
+		error = xfs_bmap_finish(&tp, &free_list, &committed);
+		if (error)
+			goto out;
+
+		error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	}
+
+	return error;
+
+out:
+	xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	return error;
+}
+
 /*
  * We need to check that the format of the data fork in the temporary inode is
  * valid for the target inode before doing the swap. This is not a problem with
diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h
index 0612609..588d29d 100644
--- a/fs/xfs/xfs_bmap_util.h
+++ b/fs/xfs/xfs_bmap_util.h
@@ -97,6 +97,8 @@ int	xfs_change_file_space(struct xfs_inode *ip, int cmd,
 			      xfs_flock64_t *bf, xfs_off_t offset,
 			      int attr_flags);
 
+int xfs_collapse_file_space(struct xfs_inode *, loff_t, loff_t, int);
+
 /* EOF block manipulation functions */
 bool	xfs_can_free_eofblocks(struct xfs_inode *ip, bool force);
 int	xfs_free_eofblocks(struct xfs_mount *mp, struct xfs_inode *ip,
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 818c623..9c9c1ff 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -807,7 +807,8 @@ xfs_file_fallocate(
 	int		cmd = XFS_IOC_RESVSP;
 	int		attr_flags = XFS_ATTR_NOLOCK;
 
-	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
+		     FALLOC_FL_COLLAPSE_RANGE))
 		return -EOPNOTSUPP;
 
 	bf.l_whence = 0;
@@ -819,10 +820,19 @@ xfs_file_fallocate(
 	if (mode & FALLOC_FL_PUNCH_HOLE)
 		cmd = XFS_IOC_UNRESVSP;
 
-	/* check the new inode size is valid before allocating */
-	if (!(mode & FALLOC_FL_KEEP_SIZE) &&
-	    offset + len > i_size_read(inode)) {
+	/* Shrink size in case of FALLOC_FL_COLLAPSE_RANGE */
+	if (mode & FALLOC_FL_COLLAPSE_RANGE) {
+		cmd = XFS_COLLAPSE_RANGE;
+		if ((offset + len) > i_size_read(inode))
+			new_size = offset;
+		else
+			new_size = i_size_read(inode) - len;
+	} else if (!(mode & FALLOC_FL_KEEP_SIZE) &&
+	    offset + len > i_size_read(inode))
 		new_size = offset + len;
+
+	/* check the new inode size is valid before allocating */
+	if (new_size || mode & FALLOC_FL_COLLAPSE_RANGE) {
 		error = inode_newsize_ok(inode, new_size);
 		if (error)
 			goto out_unlock;
@@ -836,7 +846,7 @@ xfs_file_fallocate(
 		goto out_unlock;
 
 	/* Change file size if needed */
-	if (new_size) {
+	if (new_size ||  mode & FALLOC_FL_COLLAPSE_RANGE) {
 		struct iattr iattr;
 
 		iattr.ia_valid = ATTR_SIZE;
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
index 1edb5cc..99f5244 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/xfs_fs.h
@@ -516,6 +516,12 @@ typedef struct xfs_swapext
 #define XFS_IOC_GETBMAPX	_IOWR('X', 56, struct getbmap)
 #define XFS_IOC_ZERO_RANGE	_IOW ('X', 57, struct xfs_flock64)
 #define XFS_IOC_FREE_EOFBLOCKS	_IOR ('X', 58, struct xfs_eofblocks)
+/*
+ * Although there is no ioctl implemented yet, we reserve an ioctl number for
+ * representing collapse range operation to avoid any possible collision in
+ * switch case of xfs_change_file_space.
+ */
+#define XFS_COLLAPSE_RANGE	_IOW('X', 59, struct xfs_flock64)
 
 /*
  * ioctl commands that replace IRIX syssgi()'s
-- 
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html