From: Xiubo Li <xiubli@xxxxxxxxxx> If the direct io couldn't be submit in a single request, for multiple writers, they may overlap each other. For example, with the file layout: ceph.file.layout="stripe_unit=4194304 stripe_count=1 object_size=4194304 fd = open(, O_DIRECT | O_WRONLY, ); Writer1: posix_memalign(&buffer, 4194304, SIZE); memset(buffer, 'T', SIZE); write(fd, buffer, SIZE); Writer2: posix_memalign(&buffer, 4194304, SIZE); memset(buffer, 'X', SIZE); write(fd, buffer, SIZE); >From the test result, the data in the file possiblly will be: TTT...TTT <---> object1 XXX...XXX <---> object2 The expected result should be all "XX.." or "TT.." in both object1 and object2. Signed-off-by: Xiubo Li <xiubli@xxxxxxxxxx> --- fs/ceph/file.c | 38 +++++++++++++++++++++++++++++++++++--- fs/ceph/inode.c | 2 ++ fs/ceph/super.h | 3 +++ 3 files changed, 40 insertions(+), 3 deletions(-) diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 1cedba452a66..2741070a58a9 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -961,6 +961,8 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, loff_t pos = iocb->ki_pos; bool write = iov_iter_rw(iter) == WRITE; bool should_dirty = !write && iter_is_iovec(iter); + bool shared_lock = false; + u64 size; if (write && ceph_snap(file_inode(file)) != CEPH_NOSNAP) return -EROFS; @@ -977,14 +979,27 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, dout("invalidate_inode_pages2_range returned %d\n", ret2); flags = /* CEPH_OSD_FLAG_ORDERSNAP | */ CEPH_OSD_FLAG_WRITE; + + /* + * If we cannot submit the whole iter in a single request we + * should block all the requests followed to avoid the data + * being overlapped by each other. + * + * But for those which could be submit in an single request + * they could excute in parallel. + * + * Hold the exclusive lock first. + */ + down_write(&ci->i_direct_rwsem); } else { flags = CEPH_OSD_FLAG_READ; } while (iov_iter_count(iter) > 0) { - u64 size = iov_iter_count(iter); ssize_t len; + size = iov_iter_count(iter); + if (write) size = min_t(u64, size, fsc->mount_options->wsize); else @@ -1011,9 +1026,16 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, ret = len; break; } + if (len != size) osd_req_op_extent_update(req, 0, len); + if (write && pos == iocb->ki_pos && len == count) { + /* Switch to shared lock */ + downgrade_write(&ci->i_direct_rwsem); + shared_lock = true; + } + /* * To simplify error handling, allow AIO when IO within i_size * or IO can be satisfied by single OSD request. @@ -1110,7 +1132,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, if (aio_req->num_reqs == 0) { kfree(aio_req); - return ret; + goto unlock; } ceph_get_cap_refs(ci, write ? CEPH_CAP_FILE_WR : @@ -1131,13 +1153,23 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, ceph_aio_complete_req(req); } } - return -EIOCBQUEUED; + ret = -EIOCBQUEUED; + goto unlock; } if (ret != -EOLDSNAPC && pos > iocb->ki_pos) { ret = pos - iocb->ki_pos; iocb->ki_pos = pos; } + +unlock: + if (write) { + if (shared_lock) + up_read(&ci->i_direct_rwsem); + else + up_write(&ci->i_direct_rwsem); + } + return ret; } diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index aee7a24bf1bc..e5d634acd273 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -518,6 +518,8 @@ struct inode *ceph_alloc_inode(struct super_block *sb) ceph_fscache_inode_init(ci); + init_rwsem(&ci->i_direct_rwsem); + ci->i_meta_err = 0; return &ci->vfs_inode; diff --git a/fs/ceph/super.h b/fs/ceph/super.h index ee81920bb1a4..213c11bf41be 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -409,6 +409,9 @@ struct ceph_inode_info { struct fscache_cookie *fscache; u32 i_fscache_gen; #endif + + struct rw_semaphore i_direct_rwsem; + errseq_t i_meta_err; struct inode vfs_inode; /* at end */ -- 2.21.0