On Thu, Feb 28, 2013 at 10:46 PM, Yan, Zheng <zheng.z.yan@xxxxxxxxx> wrote: > From: "Yan, Zheng" <zheng.z.yan@xxxxxxxxx> > > ceph_aio_write() has an optimization that marks CEPH_CAP_FILE_WR > cap dirty before data is copied to page cache and inode size is > updated. The optimization avoids slow cap revocation caused by > balance_dirty_pages(), but introduces inode size update race. If > ceph_check_caps() flushes the dirty cap before the inode size is > updated, MDS can miss the new inode size. So just remove the > optimization. > > Signed-off-by: Yan, Zheng <zheng.z.yan@xxxxxxxxx> > --- > fs/ceph/file.c | 42 +++++++++++++++++------------------------- > 1 file changed, 17 insertions(+), 25 deletions(-) > > diff --git a/fs/ceph/file.c b/fs/ceph/file.c > index a949805..28ef273 100644 > --- a/fs/ceph/file.c > +++ b/fs/ceph/file.c > @@ -724,9 +724,12 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov, > if (ceph_snap(inode) != CEPH_NOSNAP) > return -EROFS; > > + sb_start_write(inode->i_sb); > retry_snap: > - if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL)) > - return -ENOSPC; > + if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL)) { > + ret = -ENOSPC; > + goto out; > + } > __ceph_do_pending_vmtruncate(inode); > dout("aio_write %p %llx.%llx %llu~%u getting caps. i_size %llu\n", > inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len, > @@ -750,29 +753,10 @@ retry_snap: > ret = ceph_sync_write(file, iov->iov_base, iov->iov_len, > &iocb->ki_pos); > } else { > - /* > - * buffered write; drop Fw early to avoid slow > - * revocation if we get stuck on balance_dirty_pages > - */ > - int dirty; > - > - spin_lock(&ci->i_ceph_lock); > - dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); > - spin_unlock(&ci->i_ceph_lock); > - ceph_put_cap_refs(ci, got); > - > - ret = generic_file_aio_write(iocb, iov, nr_segs, pos); > - if ((ret >= 0 || ret == -EIOCBQUEUED) && > - ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host) > - || ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) { > - err = vfs_fsync_range(file, pos, pos + ret - 1, 1); > - if (err < 0) > - ret = err; > - } > - > - if (dirty) > - __mark_inode_dirty(inode, dirty); > - goto out; > + mutex_lock(&inode->i_mutex); > + ret = __generic_file_aio_write(iocb, iov, nr_segs, > + &iocb->ki_pos); > + mutex_unlock(&inode->i_mutex); Hmm, you're here passing in a different value than the removed generic_file_aio_write() call did — &iocb->ki_pos instead of pos. Everything else is using the pos parameter so I rather expect that should still be used here? Also a quick skim of the interfaces makes me think that the two versions aren't interchangeable — __generic_file_aio_write() also handles O_SYNC in addition to grabbing i_mutex. Why'd you switch them? (I haven't checked deep enough to see if the unlocked version is correct or not, but it does say that's what most filesystems should use.) -Greg > } > > if (ret >= 0) { > @@ -790,12 +774,20 @@ out_put: > ceph_cap_string(got)); > ceph_put_cap_refs(ci, got); > > + if (ret >= 0 && > + ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host) || > + ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) { > + err = vfs_fsync_range(file, pos, pos + ret - 1, 1); > + if (err < 0) > + ret = err; > + } > out: > if (ret == -EOLDSNAPC) { > dout("aio_write %p %llx.%llx %llu~%u got EOLDSNAPC, retrying\n", > inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len); > goto retry_snap; > } > + sb_end_write(inode->i_sb); > > return ret; > } > -- > 1.7.11.7 > > -- > To unsubscribe from this list: send the line "unsubscribe ceph-devel" in > the body of a message to majordomo@xxxxxxxxxxxxxxx > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html