On 03/05/2013 02:26 AM, Gregory Farnum wrote: > On Thu, Feb 28, 2013 at 10:46 PM, Yan, Zheng <zheng.z.yan@xxxxxxxxx> wrote: >> From: "Yan, Zheng" <zheng.z.yan@xxxxxxxxx> >> >> ceph_aio_write() has an optimization that marks CEPH_CAP_FILE_WR >> cap dirty before data is copied to page cache and inode size is >> updated. The optimization avoids slow cap revocation caused by >> balance_dirty_pages(), but introduces inode size update race. If >> ceph_check_caps() flushes the dirty cap before the inode size is >> updated, MDS can miss the new inode size. So just remove the >> optimization. >> >> Signed-off-by: Yan, Zheng <zheng.z.yan@xxxxxxxxx> >> --- >> fs/ceph/file.c | 42 +++++++++++++++++------------------------- >> 1 file changed, 17 insertions(+), 25 deletions(-) >> >> diff --git a/fs/ceph/file.c b/fs/ceph/file.c >> index a949805..28ef273 100644 >> --- a/fs/ceph/file.c >> +++ b/fs/ceph/file.c >> @@ -724,9 +724,12 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov, >> if (ceph_snap(inode) != CEPH_NOSNAP) >> return -EROFS; >> >> + sb_start_write(inode->i_sb); >> retry_snap: >> - if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL)) >> - return -ENOSPC; >> + if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL)) { >> + ret = -ENOSPC; >> + goto out; >> + } >> __ceph_do_pending_vmtruncate(inode); >> dout("aio_write %p %llx.%llx %llu~%u getting caps. i_size %llu\n", >> inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len, >> @@ -750,29 +753,10 @@ retry_snap: >> ret = ceph_sync_write(file, iov->iov_base, iov->iov_len, >> &iocb->ki_pos); >> } else { >> - /* >> - * buffered write; drop Fw early to avoid slow >> - * revocation if we get stuck on balance_dirty_pages >> - */ >> - int dirty; >> - >> - spin_lock(&ci->i_ceph_lock); >> - dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); >> - spin_unlock(&ci->i_ceph_lock); >> - ceph_put_cap_refs(ci, got); >> - >> - ret = generic_file_aio_write(iocb, iov, nr_segs, pos); >> - if ((ret >= 0 || ret == -EIOCBQUEUED) && >> - ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host) >> - || ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) { >> - err = vfs_fsync_range(file, pos, pos + ret - 1, 1); >> - if (err < 0) >> - ret = err; >> - } >> - >> - if (dirty) >> - __mark_inode_dirty(inode, dirty); >> - goto out; >> + mutex_lock(&inode->i_mutex); >> + ret = __generic_file_aio_write(iocb, iov, nr_segs, >> + &iocb->ki_pos); >> + mutex_unlock(&inode->i_mutex); > > Hmm, you're here passing in a different value than the removed > generic_file_aio_write() call did — &iocb->ki_pos instead of pos. > Everything else is using the pos parameter so I rather expect that > should still be used here? They always have the same value, see the BUG_ON in generic_file_aio_write() > Also a quick skim of the interfaces makes me think that the two > versions aren't interchangeable — __generic_file_aio_write() also > handles O_SYNC in addition to grabbing i_mutex. Why'd you switch them? ceph has its own code that handles O_SYNC case. I want to make sb_start_write() covers ceph_sync_write(), that's the reason I use __generic_file_aio_write() here. Regards Yan, Zheng > (I haven't checked deep enough to see if the unlocked version is > correct or not, but it does say that's what most filesystems should > use.) > -Greg > > >> } >> >> if (ret >= 0) { >> @@ -790,12 +774,20 @@ out_put: >> ceph_cap_string(got)); >> ceph_put_cap_refs(ci, got); >> >> + if (ret >= 0 && >> + ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host) || >> + ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) { >> + err = vfs_fsync_range(file, pos, pos + ret - 1, 1); >> + if (err < 0) >> + ret = err; >> + } >> out: >> if (ret == -EOLDSNAPC) { >> dout("aio_write %p %llx.%llx %llu~%u got EOLDSNAPC, retrying\n", >> inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len); >> goto retry_snap; >> } >> + sb_end_write(inode->i_sb); >> >> return ret; >> } >> -- >> 1.7.11.7 >> >> -- >> To unsubscribe from this list: send the line "unsubscribe ceph-devel" in >> the body of a message to majordomo@xxxxxxxxxxxxxxx >> More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html