On Wed, Jun 5, 2019 at 11:01 PM Jeff Layton <jlayton@xxxxxxxxxx> wrote: > > On Wed, 2019-06-05 at 22:27 +0800, Yan, Zheng wrote: > > Use errseq_t to track and report errors of async metadata operations, > > similar to how kernel handles errors during writeback. > > > > If any dirty caps or any unsafe request gets dropped during session > > eviction, record -EIO in corresponding inode's i_meta_err. The error > > will be reported by subsequent fsync, > > > > Signed-off-by: "Yan, Zheng" <zyan@xxxxxxxxxx> > > --- > > fs/ceph/caps.c | 16 ++++++++++++++-- > > fs/ceph/file.c | 6 ++++-- > > fs/ceph/inode.c | 2 ++ > > fs/ceph/mds_client.c | 38 +++++++++++++++++++++++++------------- > > fs/ceph/super.h | 4 ++++ > > 5 files changed, 49 insertions(+), 17 deletions(-) > > > > diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c > > index 50409d9fdc90..fd9ab97c7f4e 100644 > > --- a/fs/ceph/caps.c > > +++ b/fs/ceph/caps.c > > @@ -2244,6 +2244,7 @@ static int unsafe_request_wait(struct inode *inode) > > > > int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) > > { > > + struct ceph_file_info *fi = file->private_data; > > struct inode *inode = file->f_mapping->host; > > struct ceph_inode_info *ci = ceph_inode(inode); > > u64 flush_tid; > > @@ -2253,11 +2254,11 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) > > dout("fsync %p%s\n", inode, datasync ? " datasync" : ""); > > > > ret = file_write_and_wait_range(file, start, end); > > - if (ret < 0) > > - goto out; > > > > if (datasync) > > goto out; > > + if (ret < 0) > > + goto check_meta_err; > > > > dirty = try_flush_caps(inode, &flush_tid); > > dout("fsync dirty caps are %s\n", ceph_cap_string(dirty)); > > @@ -2273,6 +2274,17 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) > > ret = wait_event_interruptible(ci->i_cap_wq, > > caps_are_flushed(inode, flush_tid)); > > } > > + > > +check_meta_err: > > + if (errseq_check(&ci->i_meta_err, READ_ONCE(fi->meta_err))) { > > + int err; > > + spin_lock(&file->f_lock); > > + err = errseq_check_and_advance(&ci->i_meta_err, > > + &fi->meta_err); > > + spin_unlock(&file->f_lock); > > + if (err) > > + ret = err; > > + } > > Do we care which error takes precedence in the event that ret is non- > zero here? I tend to think not, but it may be worth a comment there that > i_meta_err taking precedence over i_wb_err is arbitrary and that we may > need to revisit that at some point in the future. > > > out: > > dout("fsync %p%s result=%d\n", inode, datasync ? " datasync" : "", ret); > > return ret; > > diff --git a/fs/ceph/file.c b/fs/ceph/file.c > > index a7080783fe20..2fe8ca7805f4 100644 > > --- a/fs/ceph/file.c > > +++ b/fs/ceph/file.c > > @@ -200,6 +200,7 @@ prepare_open_request(struct super_block *sb, int flags, int create_mode) > > static int ceph_init_file_info(struct inode *inode, struct file *file, > > int fmode, bool isdir) > > { > > + struct ceph_inode_info *ci = ceph_inode(inode); > > struct ceph_file_info *fi; > > > > dout("%s %p %p 0%o (%s)\n", __func__, inode, file, > > @@ -210,7 +211,7 @@ static int ceph_init_file_info(struct inode *inode, struct file *file, > > struct ceph_dir_file_info *dfi = > > kmem_cache_zalloc(ceph_dir_file_cachep, GFP_KERNEL); > > if (!dfi) { > > - ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */ > > + ceph_put_fmode(ci, fmode); /* clean up */ > > return -ENOMEM; > > } > > > > @@ -221,7 +222,7 @@ static int ceph_init_file_info(struct inode *inode, struct file *file, > > } else { > > fi = kmem_cache_zalloc(ceph_file_cachep, GFP_KERNEL); > > if (!fi) { > > - ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */ > > + ceph_put_fmode(ci, fmode); /* clean up */ > > return -ENOMEM; > > } > > > > @@ -231,6 +232,7 @@ static int ceph_init_file_info(struct inode *inode, struct file *file, > > fi->fmode = fmode; > > spin_lock_init(&fi->rw_contexts_lock); > > INIT_LIST_HEAD(&fi->rw_contexts); > > + fi->meta_err = errseq_sample(&ci->i_meta_err); > > > > return 0; > > } > > diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c > > index 6003187dd39e..8c555734f8d5 100644 > > --- a/fs/ceph/inode.c > > +++ b/fs/ceph/inode.c > > @@ -512,6 +512,8 @@ struct inode *ceph_alloc_inode(struct super_block *sb) > > > > ceph_fscache_inode_init(ci); > > > > + ci->i_meta_err = 0; > > + > > return &ci->vfs_inode; > > } > > > > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c > > index c0a15e723f11..f2be9c74c3ae 100644 > > --- a/fs/ceph/mds_client.c > > +++ b/fs/ceph/mds_client.c > > @@ -1264,6 +1264,7 @@ static void cleanup_session_requests(struct ceph_mds_client *mdsc, > > { > > struct ceph_mds_request *req; > > struct rb_node *p; > > + struct ceph_inode_info *ci; > > > > dout("cleanup_session_requests mds%d\n", session->s_mds); > > mutex_lock(&mdsc->mutex); > > @@ -1272,6 +1273,14 @@ static void cleanup_session_requests(struct ceph_mds_client *mdsc, > > struct ceph_mds_request, r_unsafe_item); > > pr_warn_ratelimited(" dropping unsafe request %llu\n", > > req->r_tid); > > + if (req->r_target_inode) { > > + ci = ceph_inode(req->r_target_inode); > > + errseq_set(&ci->i_meta_err, -EIO); > > + } > > + if (req->r_unsafe_dir) { > > + ci = ceph_inode(req->r_unsafe_dir); > > + errseq_set(&ci->i_meta_err, -EIO); > > + } > > Do we really want to set this on both inodes? For operations that create new file/dir, r_target_inode is new inode, r_unsafe_dir is parent inode. I think fsync on either parent inode or the new inode should ensure the new file/dir is persistent. So I set error on both inode. > > When we talk about async metadata operations here, we're really talking > about operations that change a directory's namespace. When those fail, > it's somewhat analogous to the situation on a blockdev-based filesystem > when writeback of an in-memory directory fails. > why only namespace operations? In my opinion, marking caps dirty is async, unsafe request is partial async. > To quote the fsync(2) manpage: > > Calling fsync() does not necessarily ensure that the > entry in the directory containing the file has also > reached disk. For that an explicit fsync() on a file > descriptor for the directory is also needed. > > So if I'm doing namespace operations in a directory on a local fs, and > issue an fsync on that dir that fails, then I will probably need to > assume that previous namespace operations may have failed. > > It's unlikely though that that filesystem would record a writeback error > in the dentry->d_inode that was the target of (e.g.) an unlink in that > case. r_target_inode is null for unsafe unlink. r_target_inode is non-null if unsafe replay contains a positive dentry trace. > > That's the behavior I think we'd want to shoot for here, so I'd just > record the error in the r_unsafe_dir inode. unsafe setattr only has r_target_inode, it has null r_unsafe_dir. how to handle it? > > > __unregister_request(mdsc, req); > > } > > /* zero r_attempts, so kick_requests() will re-send requests */ > > @@ -1364,7 +1373,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, > > struct ceph_fs_client *fsc = (struct ceph_fs_client *)arg; > > struct ceph_inode_info *ci = ceph_inode(inode); > > LIST_HEAD(to_remove); > > - bool drop = false; > > + bool dirty_dropped = false; > > bool invalidate = false; > > > > dout("removing cap %p, ci is %p, inode is %p\n", > > @@ -1402,7 +1411,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, > > inode, ceph_ino(inode)); > > ci->i_dirty_caps = 0; > > list_del_init(&ci->i_dirty_item); > > - drop = true; > > + dirty_dropped = true; > > } > > if (!list_empty(&ci->i_flushing_item)) { > > pr_warn_ratelimited( > > @@ -1412,10 +1421,22 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, > > ci->i_flushing_caps = 0; > > list_del_init(&ci->i_flushing_item); > > mdsc->num_cap_flushing--; > > - drop = true; > > + dirty_dropped = true; > > } > > spin_unlock(&mdsc->cap_dirty_lock); > > > > + if (dirty_dropped) { > > + errseq_set(&ci->i_meta_err, -EIO); > > + > > + if (ci->i_wrbuffer_ref_head == 0 && > > + ci->i_wr_ref == 0 && > > + ci->i_dirty_caps == 0 && > > + ci->i_flushing_caps == 0) { > > + ceph_put_snap_context(ci->i_head_snapc); > > + ci->i_head_snapc = NULL; > > + } > > + } > > + > > if (atomic_read(&ci->i_filelock_ref) > 0) { > > /* make further file lock syscall return -EIO */ > > ci->i_ceph_flags |= CEPH_I_ERROR_FILELOCK; > > @@ -1427,15 +1448,6 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, > > list_add(&ci->i_prealloc_cap_flush->i_list, &to_remove); > > ci->i_prealloc_cap_flush = NULL; > > } > > - > > - if (drop && > > - ci->i_wrbuffer_ref_head == 0 && > > - ci->i_wr_ref == 0 && > > - ci->i_dirty_caps == 0 && > > - ci->i_flushing_caps == 0) { > > - ceph_put_snap_context(ci->i_head_snapc); > > - ci->i_head_snapc = NULL; > > - } > > } > > spin_unlock(&ci->i_ceph_lock); > > while (!list_empty(&to_remove)) { > > @@ -1449,7 +1461,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, > > wake_up_all(&ci->i_cap_wq); > > if (invalidate) > > ceph_queue_invalidate(inode); > > - if (drop) > > + if (dirty_dropped) > > iput(inode); > > return 0; > > } > > diff --git a/fs/ceph/super.h b/fs/ceph/super.h > > index 98d2bafc2ee2..2e516d47052f 100644 > > --- a/fs/ceph/super.h > > +++ b/fs/ceph/super.h > > @@ -393,6 +393,8 @@ struct ceph_inode_info { > > struct fscache_cookie *fscache; > > u32 i_fscache_gen; > > #endif > > + errseq_t i_meta_err; > > + > > struct inode vfs_inode; /* at end */ > > }; > > > > @@ -701,6 +703,8 @@ struct ceph_file_info { > > > > spinlock_t rw_contexts_lock; > > struct list_head rw_contexts; > > + > > + errseq_t meta_err; > > }; > > > > struct ceph_dir_file_info { > > >