[PATCH 06/14] xfs: implement copy-on-write for reflinked blocks

"Darrick J. Wong" <darrick.wong@xxxxxxxxxx> · Thu, 25 Jun 2015 16:39:50 -0700

Implement a copy-on-write handler for the buffered write path.  When
writepages is called, allocate a new block (which we then tell the log
that we intend to delete so that it's freed if we crash), and then
write the buffer to the new block.  Upon completion, remove the freed
block intent from the log and remap the file so that the changes
appear.

Signed-off-by: Darrick J. Wong <darrick.wong@xxxxxxxxxx>
---
 fs/xfs/xfs_aops.c    |   38 +++++-
 fs/xfs/xfs_aops.h    |    5 +
 fs/xfs/xfs_reflink.c |  340 ++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/xfs_reflink.h |   15 ++
 4 files changed, 393 insertions(+), 5 deletions(-)

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index dc52698..be57e5d 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -31,6 +31,8 @@
 #include "xfs_bmap.h"
 #include "xfs_bmap_util.h"
 #include "xfs_bmap_btree.h"
+#include "xfs_reflink.h"
+#include <linux/aio.h>
 #include <linux/gfp.h>
 #include <linux/mpage.h>
 #include <linux/pagevec.h>
@@ -190,7 +192,8 @@ xfs_finish_ioend(
 	if (atomic_dec_and_test(&ioend->io_remaining)) {
 		struct xfs_mount	*mp = XFS_I(ioend->io_inode)->i_mount;
 
-		if (ioend->io_type == XFS_IO_UNWRITTEN)
+		if (ioend->io_type == XFS_IO_UNWRITTEN ||
+		    ioend->io_type == XFS_IO_FORKED)
 			queue_work(mp->m_unwritten_workqueue, &ioend->io_work);
 		else if (ioend->io_append_trans)
 			queue_work(mp->m_data_workqueue, &ioend->io_work);
@@ -218,6 +221,19 @@ xfs_end_io(
 		goto done;
 
 	/*
+	 * If we forked the block, we need to remap the bmbt and possibly
+	 * finish up the i_size transaction too.
+	 */
+	if (ioend->io_type == XFS_IO_FORKED) {
+		error = xfs_reflink_end_io(ip->i_mount, ip, ioend);
+		if (error)
+			goto done;
+		if (ioend->io_append_trans)
+			error = xfs_setfilesize_ioend(ioend);
+		goto done;
+	}
+
+	/*
 	 * For unwritten extents we need to issue transactions to convert a
 	 * range to normal written extens after the data I/O has finished.
 	 */
@@ -268,6 +284,7 @@ xfs_alloc_ioend(
 	ioend->io_append_trans = NULL;
 
 	INIT_WORK(&ioend->io_work, xfs_end_io);
+	INIT_LIST_HEAD(&ioend->io_reflink_endio_list);
 	return ioend;
 }
 
@@ -567,7 +584,8 @@ xfs_add_to_ioend(
 	xfs_off_t		offset,
 	unsigned int		type,
 	xfs_ioend_t		**result,
-	int			need_ioend)
+	int			need_ioend,
+	xfs_reflink_end_io_t	*eio)
 {
 	xfs_ioend_t		*ioend = *result;
 
@@ -588,6 +606,8 @@ xfs_add_to_ioend(
 
 	bh->b_private = NULL;
 	ioend->io_size += bh->b_size;
+	if (eio)
+		list_add_tail(&eio->rlei_list, &ioend->io_reflink_endio_list);
 }
 
 STATIC void
@@ -788,7 +808,7 @@ xfs_convert_page(
 			if (type != XFS_IO_OVERWRITE)
 				xfs_map_at_offset(inode, bh, imap, offset);
 			xfs_add_to_ioend(inode, bh, offset, type,
-					 ioendp, done);
+					 ioendp, done, NULL);
 
 			page_dirty--;
 			count++;
@@ -951,6 +971,7 @@ xfs_vm_writepage(
 	int			err, imap_valid = 0, uptodate = 1;
 	int			count = 0;
 	int			nonblocking = 0;
+	struct xfs_inode	*ip = XFS_I(inode);
 
 	trace_xfs_writepage(inode, page, 0, 0);
 
@@ -1119,11 +1140,17 @@ xfs_vm_writepage(
 			imap_valid = xfs_imap_valid(inode, &imap, offset);
 		}
 		if (imap_valid) {
+			xfs_reflink_end_io_t *eio = NULL;
+
+			err = xfs_reflink_fork_block(ip, &imap, offset,
+						     &type, &eio);
+			if (err)
+				goto error;
 			lock_buffer(bh);
 			if (type != XFS_IO_OVERWRITE)
 				xfs_map_at_offset(inode, bh, &imap, offset);
 			xfs_add_to_ioend(inode, bh, offset, type, &ioend,
-					 new_ioend);
+					 new_ioend, eio);
 			count++;
 		}
 
@@ -1137,6 +1164,9 @@ xfs_vm_writepage(
 
 	xfs_start_page_writeback(page, 1, count);
 
+	if (err)
+		goto error;
+
 	/* if there is no IO to be submitted for this page, we are done */
 	if (!ioend)
 		return 0;
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index 86afd1a..9cf206a 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -27,12 +27,14 @@ enum {
 	XFS_IO_DELALLOC,	/* covers delalloc region */
 	XFS_IO_UNWRITTEN,	/* covers allocated but uninitialized data */
 	XFS_IO_OVERWRITE,	/* covers already allocated extent */
+	XFS_IO_FORKED,		/* covers copy-on-write region */
 };
 
 #define XFS_IO_TYPES \
 	{ XFS_IO_DELALLOC,		"delalloc" }, \
 	{ XFS_IO_UNWRITTEN,		"unwritten" }, \
-	{ XFS_IO_OVERWRITE,		"overwrite" }
+	{ XFS_IO_OVERWRITE,		"overwrite" }, \
+	{ XFS_IO_FORKED,		"forked" }
 
 /*
  * xfs_ioend struct manages large extent writes for XFS.
@@ -50,6 +52,7 @@ typedef struct xfs_ioend {
 	xfs_off_t		io_offset;	/* offset in the file */
 	struct work_struct	io_work;	/* xfsdatad work queue */
 	struct xfs_trans	*io_append_trans;/* xact. for size update */
+	struct list_head	io_reflink_endio_list;/* remappings for CoW */
 } xfs_ioend_t;
 
 extern const struct address_space_operations xfs_address_space_operations;
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index ce5feeb..39b29a4 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -45,6 +45,31 @@
 #include "xfs_alloc.h"
 #include "xfs_quota_defs.h"
 #include "xfs_quota.h"
+#include "xfs_btree.h"
+#include "xfs_bmap_btree.h"
+
+#define CHECK_AG_NUMBER(mp, agno) \
+	do { /* agno must be a valid AG index for this mount */ \
+		ASSERT((agno) != NULLAGNUMBER); \
+		ASSERT((agno) < (mp)->m_sb.sb_agcount); \
+	} while (0)
+
+#define CHECK_AG_EXTENT(mp, agbno, len) \
+	do { /* [agbno, agbno + len) must be non-empty and inside the AG */ \
+		ASSERT((agbno) != NULLAGBLOCK); \
+		ASSERT((len) > 0); \
+		ASSERT((unsigned long long)(agbno) + (len) <= \
+				(mp)->m_sb.sb_agblocks); \
+	} while (0)
+
+#define XFS_WANT_CORRUPTED_RLEXT_GOTO(mp, have, agbno, len, nr, label) \
+	do { /* sanity-check a reflink record via XFS_WANT_CORRUPTED_GOTO */ \
+		XFS_WANT_CORRUPTED_GOTO((mp), (have) == 1, label); \
+		XFS_WANT_CORRUPTED_GOTO((mp), (len) > 0, label); \
+		XFS_WANT_CORRUPTED_GOTO((mp), (nr) >= 2, label); \
+		XFS_WANT_CORRUPTED_GOTO((mp), (unsigned long long)(agbno) + \
+				(len) <= (mp)->m_sb.sb_agblocks, label); \
+	} while (0)
 
 /**
  * xfs_reflink() - link a range of blocks from one inode to another
@@ -294,3 +319,318 @@ out_unlock_io:
 
 	return error;
 }
+
+/**
+ * xfs_reflink_get_refcount() - get refcount and extent length for a given pblk
+ *
+ * @mp: XFS mount object
+ * @agno: AG number
+ * @agbno: AG block number to look up
+ * @len: out: blocks from @agbno to the end of the extent (or hole) it is in
+ * @nr: out: refcount of that range; 1 when no reflink record covers @agbno
+ * Return: 0 on success, else the error from the AGF read or the btree walk.
+ */
+int
+xfs_reflink_get_refcount(
+	struct xfs_mount	*mp,		/* xfs mount object */
+	xfs_agnumber_t		agno,		/* allocation group number */
+	xfs_agblock_t		agbno,		/* ag block to look up */
+	xfs_extlen_t		*len,		/* out: length of extent */
+	xfs_nlink_t		*nr)		/* out: refcount */
+{
+	struct xfs_btree_cur	*cur;
+	struct xfs_buf		*agbp;
+	xfs_agblock_t		lbno;		/* rlextent start */
+	xfs_extlen_t		llen;		/* rlextent length */
+	xfs_nlink_t		lnr;		/* rlextent refcount */
+	xfs_extlen_t		aglen;
+	int			error;
+	int			i, have;
+
+	if (!xfs_sb_version_hasreflink(&mp->m_sb)) {
+		*len = 0;
+		*nr = 1;
+		return 0;
+	}
+
+	CHECK_AG_NUMBER(mp, agno);
+	CHECK_AG_EXTENT(mp, agbno, 1);
+
+	error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp);
+	if (error)
+		return error;
+	aglen = be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_length);
+	ASSERT(agbno < aglen);
+
+	/* See if there's an extent covering the block we want. */
+	cur = xfs_reflinkbt_init_cursor(mp, NULL, agbp, agno);
+	error = xfs_reflink_lookup_le(cur, agbno, &have);
+	if (error)
+		goto error0;
+	if (!have)
+		goto hole;
+	error = xfs_reflink_get_rec(cur, &lbno, &llen, &lnr, &i);
+	if (error)
+		goto error0;
+	XFS_WANT_CORRUPTED_RLEXT_GOTO(mp, i, lbno, llen, lnr, error0);
+	if (lbno + llen <= agbno)
+		goto hole;
+
+	*len = llen - (agbno - lbno);
+	*nr = lnr;
+	goto out;
+hole:
+	/*
+	 * We're in a hole, so pretend that we have a refcount=1 extent
+	 * going to the next rlextent or the end of the AG.
+	 */
+	error = xfs_btree_increment(cur, 0, &have);
+	if (error)
+		goto error0;
+	if (!have)
+		*len = aglen - agbno;
+	else {
+		error = xfs_reflink_get_rec(cur, &lbno, &llen, &lnr, &i);
+		if (error)	/* outputs are not valid if the read failed */
+			goto error0;
+		XFS_WANT_CORRUPTED_RLEXT_GOTO(mp, i, lbno, llen, lnr, error0);
+		ASSERT(lbno + llen >= agbno);
+		*len = lbno - agbno;
+	}
+	*nr = 1;
+out:
+	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+	xfs_buf_relse(agbp);
+	return error;
+error0:
+	xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+	xfs_buf_relse(agbp);
+	return error;
+}
+
+/**
+ * xfs_reflink_fork_block() - start forking a block, if reflinked
+ *
+ * @ip: XFS inode object
+ * @imap: the fileoff:fsblock mapping that we might fork
+ * @offset: the file offset of the block we're examining
+ * @type: the ioend type; becomes XFS_IO_FORKED when the block is forked
+ * @peio: out: end_io context for a forked block; untouched otherwise
+ */
+int
+xfs_reflink_fork_block(
+	struct xfs_inode	*ip,		/* xfs inode object */
+	xfs_bmbt_irec_t		*imap,		/* in/out: block mapping */
+	xfs_off_t		offset,		/* file offset */
+	unsigned int		*type,		/* in/out: what kind of io is this? */
+	xfs_reflink_end_io_t	**peio)		/* out: reflink context for end_io */
+{
+	xfs_fsblock_t		fsbno;
+	xfs_off_t		iomap_offset;
+	xfs_agnumber_t		agno;		/* allocation group number */
+	xfs_agblock_t		agbno;		/* ag block backing @offset */
+	xfs_alloc_arg_t		args;		/* allocation arguments */
+	xfs_extlen_t		len;		/* rlextent length */
+	xfs_nlink_t		nr;		/* rlextent refcount */
+	struct xfs_trans	*tp = NULL;
+	int			error;
+	xfs_reflink_end_io_t	*eio;
+	struct xfs_mount	*mp = ip->i_mount;
+
+	if (!xfs_sb_version_hasreflink(&mp->m_sb))
+		return 0;
+	if (*type == XFS_IO_DELALLOC || *type == XFS_IO_UNWRITTEN)
+		return 0;
+
+	iomap_offset = XFS_FSB_TO_B(mp, imap->br_startoff);
+	fsbno = imap->br_startblock + XFS_B_TO_FSB(mp, offset - iomap_offset);
+	agno = XFS_FSB_TO_AGNO(mp, fsbno);
+	agbno = XFS_FSB_TO_AGBNO(mp, fsbno);
+	CHECK_AG_NUMBER(mp, agno);
+	CHECK_AG_EXTENT(mp, agbno, 1);
+	ASSERT(imap->br_state == XFS_EXT_NORM);
+
+	/*
+	 * See if there's an extent covering the block we want.  If so,
+	 * then this block is reflinked and must be forked.
+	 */
+	error = xfs_reflink_get_refcount(mp, agno, agbno, &len, &nr);
+	if (error)
+		return error;
+	ASSERT(len != 0);
+	if (nr < 2)
+		return 0;
+
+	/* Ok, we have to fork this block.  First set up a transaction... */
+	tp = xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE);
+	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
+				  XFS_DIOSTRAT_SPACE_RES(mp, 2), 0);
+	if (error)
+		goto error0;
+
+	/*
+	 * Now allocate a block, stash the new mapping, and add an EFI entry
+	 * so the block gets cleared if we crash.
+	 *
+	 * XXX: Ideally we'd scan up and down the incore extent list
+	 * looking for a block, but do this stupid thing for now.
+	 */
+	memset(&args, 0, sizeof(args));
+	args.tp = tp;
+	args.mp = mp;
+	args.type = XFS_ALLOCTYPE_START_BNO;
+	args.firstblock = imap->br_startblock;
+	args.fsbno = imap->br_startblock;
+	args.minlen = args.maxlen = args.prod = 1;
+	args.userdata = XFS_ALLOC_USERDATA;
+	error = xfs_alloc_vextent(&args);
+	if (!error && args.fsbno == NULLFSBLOCK)
+		error = -ENOSPC;	/* allocator found nothing to hand out */
+	if (error)
+		goto error0;
+	ASSERT(args.len == 1);
+
+	imap->br_startblock = args.fsbno;
+	imap->br_startoff = XFS_B_TO_FSB(mp, offset);
+	imap->br_blockcount = args.len;
+	imap->br_state = XFS_EXT_NORM;
+
+	eio = kmem_zalloc(sizeof(*eio), KM_SLEEP | KM_NOFS);
+	eio->rlei_efi = xfs_trans_get_efi(tp, 1);
+	eio->rlei_mapping = *imap;
+	xfs_trans_log_efi_extent(tp, eio->rlei_efi, imap->br_startblock,
+				 imap->br_blockcount);
+
+	/* Only hand eio to the caller once the EFI has been committed. */
+	error = xfs_trans_commit(tp);
+	if (error) {
+		kmem_free(eio);
+		return error;
+	}
+	*type = XFS_IO_FORKED;
+	*peio = eio;
+	return 0;
+error0:
+	xfs_trans_cancel(tp);
+	return error;
+}
+
+/**
+ * xfs_reflink_remap_after_io() - remap a range of file blocks after forking
+ *
+ * @mp: XFS mount object
+ * @ip: XFS inode object
+ * @eio: end_io context holding the new mapping and the EFI logged for it
+ */
+STATIC int
+xfs_reflink_remap_after_io(
+	struct xfs_mount	*mp,		/* XFS mount object */
+	struct xfs_inode	*ip,		/* inode */
+	xfs_reflink_end_io_t	*eio)		/* endio data */
+{
+	struct xfs_trans	*tp = NULL;
+	int			error;
+	xfs_agnumber_t		agno;		/* allocation group number */
+	xfs_agblock_t		agbno;		/* ag block of the new mapping */
+	xfs_fsblock_t		firstfsb;
+	int			committed;
+	xfs_bmbt_irec_t		imaps[1];
+	int			nimaps = 1;
+	int			done;		/* set by xfs_bunmapi, not examined */
+	xfs_bmap_free_t		free_list;
+	xfs_bmbt_irec_t		*imap = &eio->rlei_mapping;
+	struct xfs_efd_log_item	*efd;
+	unsigned int		resblks;
+
+	ASSERT(xfs_sb_version_hasreflink(&mp->m_sb));
+	agno = XFS_FSB_TO_AGNO(mp, imap->br_startblock);
+	agbno = XFS_FSB_TO_AGBNO(mp, imap->br_startblock);
+	CHECK_AG_NUMBER(mp, agno);
+	CHECK_AG_EXTENT(mp, agbno, 1);
+	ASSERT(imap->br_state == XFS_EXT_NORM);
+
+	ASSERT(!XFS_IS_REALTIME_INODE(ip));
+
+	/*
+	 * Set up a transaction -- we're munging the rlbt update, the unmap,
+	 * and the remap operation into one huge transaction.
+	 */
+	resblks = XFS_DIOSTRAT_SPACE_RES(mp, imap->br_blockcount * 3);
+	tp = xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE);
+	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, resblks, 0);
+	if (error) {
+		xfs_trans_cancel(tp);
+		return error;
+	}
+
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+
+	/*
+	 * Log an EFD against the EFI that xfs_reflink_fork_block() logged.
+	 */
+	efd = xfs_trans_get_efd(tp, eio->rlei_efi, 1);
+	xfs_trans_log_efd_extent(tp, efd, imap->br_startblock,
+				 imap->br_blockcount);
+
+	/*
+	 * Unmap the range, then xfs_bmapi_write() it again with the new block.
+	 */
+	xfs_bmap_init(&free_list, &firstfsb);
+	error = xfs_bunmapi(tp, ip, imap->br_startoff, imap->br_blockcount, 0,
+			imap->br_blockcount, &firstfsb, &free_list, &done);
+	if (error)
+		goto error2;
+
+	error = xfs_bmapi_write(tp, ip, imap->br_startoff, imap->br_blockcount,
+					XFS_BMAPI_REFLINK, &imap->br_startblock,
+					0, &imaps[0], &nimaps, &free_list);
+	if (error)
+		goto error2;
+
+	/*
+	 * Finish transaction.
+	 */
+	error = xfs_bmap_finish(&tp, &free_list, &committed);
+	if (error)
+		goto error1;
+
+
+	error = xfs_trans_commit(tp);
+	return error;
+
+error2:
+	xfs_bmap_cancel(&free_list);
+error1:
+	xfs_trans_cancel(tp);
+	return error;
+}
+
+/**
+ * xfs_reflink_end_io() - remap all blocks after forking
+ *
+ * @mp: XFS mount object
+ * @ip: XFS inode object
+ * @ioend: the io completion object; its reflink end_io list is emptied
+ * Return: 0 if every remap succeeded, else the first remap error seen.
+ */
+int
+xfs_reflink_end_io(
+	struct xfs_mount	*mp,		/* XFS mount object */
+	struct xfs_inode	*ip,		/* inode */
+	xfs_ioend_t		*ioend)		/* IO completion object */
+{
+	int			error = 0, err2;
+	xfs_reflink_end_io_t	*eio, *n;
+
+	list_for_each_entry_safe(eio, n, &ioend->io_reflink_endio_list,
+				 rlei_list) {
+		err2 = xfs_reflink_remap_after_io(mp, ip, eio);
+		if (error == 0)
+			error = err2;	/* remember first failure */
+		list_del(&eio->rlei_list);	/* unlink before freeing */
+		kmem_free(eio);
+	}
+
+	return error;
+}
diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
index 7cccd50..40a6576 100644
--- a/fs/xfs/xfs_reflink.h
+++ b/fs/xfs/xfs_reflink.h
@@ -18,7 +18,22 @@
 #ifndef __XFS_REFLINK_H
 #define __XFS_REFLINK_H 1
 
+typedef struct xfs_reflink_end_io {
+	struct list_head	rlei_list;	/* link in ioend's endio list */
+	xfs_bmbt_irec_t		rlei_mapping;	/* new (forked) mapping */
+	struct xfs_efi_log_item	*rlei_efi;	/* EFI logged for new block */
+} xfs_reflink_end_io_t;
+
 extern int xfs_reflink(struct xfs_inode *src, xfs_off_t srcoff,
 	struct xfs_inode *dest, xfs_off_t destoff, xfs_off_t len);
 
+extern int xfs_reflink_get_refcount(struct xfs_mount *mp, xfs_agnumber_t agno,
+	xfs_agblock_t agbno, xfs_extlen_t *len, xfs_nlink_t *nr);
+
+extern int xfs_reflink_fork_block(struct xfs_inode *ip, xfs_bmbt_irec_t *imap,
+	xfs_off_t offset, unsigned int *type, xfs_reflink_end_io_t **peio);
+
+extern int xfs_reflink_end_io(struct xfs_mount *mp, struct xfs_inode *ip,
+	xfs_ioend_t *ioend);
+
 #endif /* __XFS_REFLINK_H */

_______________________________________________
xfs mailing list
xfs@xxxxxxxxxxx
http://oss.sgi.com/mailman/listinfo/xfs