[RFC PATCH] ceph: serialize the direct writes when couldn't submit in a single req

xiubli@xxxxxxxxxx · Mon, 3 Feb 2020 20:54:45 -0500

From: Xiubo Li <xiubli@xxxxxxxxxx>

If a direct I/O write cannot be submitted in a single request, the
data written by multiple concurrent writers may interleave.

For example, with the file layout:
ceph.file.layout="stripe_unit=4194304 stripe_count=1 object_size=4194304"

fd = open(, O_DIRECT | O_WRONLY, );

Writer1:
posix_memalign(&buffer, 4194304, SIZE);
memset(buffer, 'T', SIZE);
write(fd, buffer, SIZE);

Writer2:
posix_memalign(&buffer, 4194304, SIZE);
memset(buffer, 'X', SIZE);
write(fd, buffer, SIZE);

From the test results, the data in the file may end up as:
TTT...TTT <---> object1
XXX...XXX <---> object2

The expected result is that object1 and object2 both contain the same
data, i.e. either all "XX.." or all "TT..".

Signed-off-by: Xiubo Li <xiubli@xxxxxxxxxx>
---
 fs/ceph/file.c  | 38 +++++++++++++++++++++++++++++++++++---
 fs/ceph/inode.c |  2 ++
 fs/ceph/super.h |  3 +++
 3 files changed, 40 insertions(+), 3 deletions(-)

diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 1cedba452a66..2741070a58a9 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -961,6 +961,8 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
 	loff_t pos = iocb->ki_pos;
 	bool write = iov_iter_rw(iter) == WRITE;
 	bool should_dirty = !write && iter_is_iovec(iter);
+	bool shared_lock = false;
+	u64 size;
 
 	if (write && ceph_snap(file_inode(file)) != CEPH_NOSNAP)
 		return -EROFS;
@@ -977,14 +979,27 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
 			dout("invalidate_inode_pages2_range returned %d\n", ret2);
 
 		flags = /* CEPH_OSD_FLAG_ORDERSNAP | */ CEPH_OSD_FLAG_WRITE;
+
+		/*
+		 * If the whole iter cannot be submitted in a single request,
+		 * block all subsequent requests so that concurrent writers
+		 * cannot interleave their data.
+		 *
+		 * Writes that can be submitted in a single request may
+		 * execute in parallel.
+		 *
+		 * Take the exclusive lock first.
+		 */
+		down_write(&ci->i_direct_rwsem);
 	} else {
 		flags = CEPH_OSD_FLAG_READ;
 	}
 
 	while (iov_iter_count(iter) > 0) {
-		u64 size = iov_iter_count(iter);
 		ssize_t len;
 
+		size = iov_iter_count(iter);
+
 		if (write)
 			size = min_t(u64, size, fsc->mount_options->wsize);
 		else
@@ -1011,9 +1026,16 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
 			ret = len;
 			break;
 		}
+
 		if (len != size)
 			osd_req_op_extent_update(req, 0, len);
 
+		if (write && pos == iocb->ki_pos && len == count) {
+			/* Switch to shared lock */
+			downgrade_write(&ci->i_direct_rwsem);
+			shared_lock = true;
+		}
+
 		/*
 		 * To simplify error handling, allow AIO when IO within i_size
 		 * or IO can be satisfied by single OSD request.
@@ -1110,7 +1132,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
 
 		if (aio_req->num_reqs == 0) {
 			kfree(aio_req);
-			return ret;
+			goto unlock;
 		}
 
 		ceph_get_cap_refs(ci, write ? CEPH_CAP_FILE_WR :
@@ -1131,13 +1153,23 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
 				ceph_aio_complete_req(req);
 			}
 		}
-		return -EIOCBQUEUED;
+		ret = -EIOCBQUEUED;
+		goto unlock;
 	}
 
 	if (ret != -EOLDSNAPC && pos > iocb->ki_pos) {
 		ret = pos - iocb->ki_pos;
 		iocb->ki_pos = pos;
 	}
+
+unlock:
+	if (write) {
+		if (shared_lock)
+			up_read(&ci->i_direct_rwsem);
+		else
+			up_write(&ci->i_direct_rwsem);
+	}
+
 	return ret;
 }
 
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index aee7a24bf1bc..e5d634acd273 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -518,6 +518,8 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
 
 	ceph_fscache_inode_init(ci);
 
+	init_rwsem(&ci->i_direct_rwsem);
+
 	ci->i_meta_err = 0;
 
 	return &ci->vfs_inode;
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index ee81920bb1a4..213c11bf41be 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -409,6 +409,9 @@ struct ceph_inode_info {
 	struct fscache_cookie *fscache;
 	u32 i_fscache_gen;
 #endif
+
+	struct rw_semaphore i_direct_rwsem;
+
 	errseq_t i_meta_err;
 
 	struct inode vfs_inode; /* at end */
-- 
2.21.0