The current code will do the uninlining but it relies on the caller to set the i_inline_version appropriately afterward. There are several potential races here. Protect against competing uninlining attempts by having ceph_uninline_data take the i_truncate_mutex and update the version itself before dropping it. Racing callers can then re-check the i_inline_version after acquiring the mutex, and if it has changed to CEPH_INLINE_NONE, they can just drop the mutex and do nothing. Finally, since we are doing a lockless check first in all cases, just move that into ceph_uninline_data as well, and have the callers call it unconditionally. Signed-off-by: Jeff Layton <jlayton@xxxxxxxxxx> --- fs/ceph/addr.c | 33 ++++++++++++++++++++++++--------- fs/ceph/file.c | 18 ++++++------------ 2 files changed, 30 insertions(+), 21 deletions(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 5f1e2b6577fb..e9700c997d12 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1541,11 +1541,9 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf) ceph_block_sigs(&oldset); - if (ci->i_inline_version != CEPH_INLINE_NONE) { - err = ceph_uninline_data(inode, off == 0 ? page : NULL); - if (err < 0) - goto out_free; - } + err = ceph_uninline_data(inode, off == 0 ? page : NULL); + if (err < 0) + goto out_free; if (off + PAGE_SIZE <= size) len = PAGE_SIZE; @@ -1593,7 +1591,6 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf) ci->i_inline_version != CEPH_INLINE_NONE) { int dirty; spin_lock(&ci->i_ceph_lock); - ci->i_inline_version = CEPH_INLINE_NONE; dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR, &prealloc_cf); spin_unlock(&ci->i_ceph_lock); @@ -1656,6 +1653,10 @@ void ceph_fill_inline_data(struct inode *inode, struct page *locked_page, } } +/* + * We borrow the i_truncate_mutex to serialize callers that may be racing to + * uninline the data. 
+ */ int ceph_uninline_data(struct inode *inode, struct page *page) { struct ceph_inode_info *ci = ceph_inode(inode); @@ -1665,15 +1666,23 @@ int ceph_uninline_data(struct inode *inode, struct page *page) int err = 0; bool from_pagecache = false; - spin_lock(&ci->i_ceph_lock); - inline_version = ci->i_inline_version; - spin_unlock(&ci->i_ceph_lock); + /* Do a lockless check first -- paired with i_ceph_lock for changes */ + inline_version = READ_ONCE(ci->i_inline_version); dout("uninline_data %p %llx.%llx inline_version %llu\n", inode, ceph_vinop(inode), inline_version); if (inline_version == 1 || /* initial version, no data */ inline_version == CEPH_INLINE_NONE) + return 0; + + mutex_lock(&ci->i_truncate_mutex); + + /* Double check the version after taking mutex */ + spin_lock(&ci->i_ceph_lock); + inline_version = ci->i_inline_version; + spin_unlock(&ci->i_ceph_lock); + if (inline_version == CEPH_INLINE_NONE) goto out; if (page) { @@ -1770,11 +1779,17 @@ int ceph_uninline_data(struct inode *inode, struct page *page) err = ceph_osdc_start_request(&fsc->client->osdc, req, false); if (!err) err = ceph_osdc_wait_request(&fsc->client->osdc, req); + if (!err) { + spin_lock(&ci->i_ceph_lock); + inline_version = CEPH_INLINE_NONE; + spin_unlock(&ci->i_ceph_lock); + } out_put: ceph_osdc_put_request(req); if (err == -ECANCELED) err = 0; out: + mutex_unlock(&ci->i_truncate_mutex); if (page) { unlock_page(page); if (from_pagecache) diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 7bb090fa99d3..3ff83135562c 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -1438,11 +1438,9 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) inode_inc_iversion_raw(inode); - if (ci->i_inline_version != CEPH_INLINE_NONE) { - err = ceph_uninline_data(inode, NULL); - if (err < 0) - goto out; - } + err = ceph_uninline_data(inode, NULL); + if (err < 0) + goto out; /* FIXME: not complete since it doesn't account for being at quota */ if (ceph_osdmap_flag(&fsc->client->osdc, 
CEPH_OSDMAP_FULL)) { @@ -1513,7 +1511,6 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) int dirty; spin_lock(&ci->i_ceph_lock); - ci->i_inline_version = CEPH_INLINE_NONE; dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR, &prealloc_cf); spin_unlock(&ci->i_ceph_lock); @@ -1762,11 +1759,9 @@ static long ceph_fallocate(struct file *file, int mode, goto unlock; } - if (ci->i_inline_version != CEPH_INLINE_NONE) { - ret = ceph_uninline_data(inode, NULL); - if (ret < 0) - goto unlock; - } + ret = ceph_uninline_data(inode, NULL); + if (ret < 0) + goto unlock; size = i_size_read(inode); @@ -1790,7 +1785,6 @@ static long ceph_fallocate(struct file *file, int mode, if (!ret) { spin_lock(&ci->i_ceph_lock); - ci->i_inline_version = CEPH_INLINE_NONE; dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR, &prealloc_cf); spin_unlock(&ci->i_ceph_lock); -- 2.21.0