Re: [PATCH V2] ceph: track and report error of async metadata operation

"Yan, Zheng" <ukernel@xxxxxxxxx> · Thu, 6 Jun 2019 08:30:15 +0800

On Wed, Jun 5, 2019 at 11:01 PM Jeff Layton <jlayton@xxxxxxxxxx> wrote:
>
> On Wed, 2019-06-05 at 22:27 +0800, Yan, Zheng wrote:
> > Use errseq_t to track and report errors of async metadata operations,
> > similar to how kernel handles errors during writeback.
> >
> > If any dirty caps or any unsafe request gets dropped during session
> > eviction, record -EIO in corresponding inode's i_meta_err. The error
> > will be reported by subsequent fsync,
> >
> > Signed-off-by: "Yan, Zheng" <zyan@xxxxxxxxxx>
> > ---
> >  fs/ceph/caps.c       | 16 ++++++++++++++--
> >  fs/ceph/file.c       |  6 ++++--
> >  fs/ceph/inode.c      |  2 ++
> >  fs/ceph/mds_client.c | 38 +++++++++++++++++++++++++-------------
> >  fs/ceph/super.h      |  4 ++++
> >  5 files changed, 49 insertions(+), 17 deletions(-)
> >
> > diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
> > index 50409d9fdc90..fd9ab97c7f4e 100644
> > --- a/fs/ceph/caps.c
> > +++ b/fs/ceph/caps.c
> > @@ -2244,6 +2244,7 @@ static int unsafe_request_wait(struct inode *inode)
> >
> >  int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
> >  {
> > +     struct ceph_file_info *fi = file->private_data;
> >       struct inode *inode = file->f_mapping->host;
> >       struct ceph_inode_info *ci = ceph_inode(inode);
> >       u64 flush_tid;
> > @@ -2253,11 +2254,11 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
> >       dout("fsync %p%s\n", inode, datasync ? " datasync" : "");
> >
> >       ret = file_write_and_wait_range(file, start, end);
> > -     if (ret < 0)
> > -             goto out;
> >
> >       if (datasync)
> >               goto out;
> > +     if (ret < 0)
> > +             goto check_meta_err;
> >
> >       dirty = try_flush_caps(inode, &flush_tid);
> >       dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));
> > @@ -2273,6 +2274,17 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
> >               ret = wait_event_interruptible(ci->i_cap_wq,
> >                                       caps_are_flushed(inode, flush_tid));
> >       }
> > +
> > +check_meta_err:
> > +     if (errseq_check(&ci->i_meta_err, READ_ONCE(fi->meta_err))) {
> > +             int err;
> > +             spin_lock(&file->f_lock);
> > +             err = errseq_check_and_advance(&ci->i_meta_err,
> > +                                            &fi->meta_err);
> > +             spin_unlock(&file->f_lock);
> > +             if (err)
> > +                     ret = err;
> > +     }
>
> Do we care which error takes precedence in the event that ret is non-
> zero here? I tend to think not, but it may be worth a comment there that
> i_meta_err taking precedence over i_wb_err is arbitrary and that we may
> need to revisit that at some point in the future.
>
> >  out:
> >       dout("fsync %p%s result=%d\n", inode, datasync ? " datasync" : "", ret);
> >       return ret;
> > diff --git a/fs/ceph/file.c b/fs/ceph/file.c
> > index a7080783fe20..2fe8ca7805f4 100644
> > --- a/fs/ceph/file.c
> > +++ b/fs/ceph/file.c
> > @@ -200,6 +200,7 @@ prepare_open_request(struct super_block *sb, int flags, int create_mode)
> >  static int ceph_init_file_info(struct inode *inode, struct file *file,
> >                                       int fmode, bool isdir)
> >  {
> > +     struct ceph_inode_info *ci = ceph_inode(inode);
> >       struct ceph_file_info *fi;
> >
> >       dout("%s %p %p 0%o (%s)\n", __func__, inode, file,
> > @@ -210,7 +211,7 @@ static int ceph_init_file_info(struct inode *inode, struct file *file,
> >               struct ceph_dir_file_info *dfi =
> >                       kmem_cache_zalloc(ceph_dir_file_cachep, GFP_KERNEL);
> >               if (!dfi) {
> > -                     ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
> > +                     ceph_put_fmode(ci, fmode); /* clean up */
> >                       return -ENOMEM;
> >               }
> >
> > @@ -221,7 +222,7 @@ static int ceph_init_file_info(struct inode *inode, struct file *file,
> >       } else {
> >               fi = kmem_cache_zalloc(ceph_file_cachep, GFP_KERNEL);
> >               if (!fi) {
> > -                     ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
> > +                     ceph_put_fmode(ci, fmode); /* clean up */
> >                       return -ENOMEM;
> >               }
> >
> > @@ -231,6 +232,7 @@ static int ceph_init_file_info(struct inode *inode, struct file *file,
> >       fi->fmode = fmode;
> >       spin_lock_init(&fi->rw_contexts_lock);
> >       INIT_LIST_HEAD(&fi->rw_contexts);
> > +     fi->meta_err = errseq_sample(&ci->i_meta_err);
> >
> >       return 0;
> >  }
> > diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
> > index 6003187dd39e..8c555734f8d5 100644
> > --- a/fs/ceph/inode.c
> > +++ b/fs/ceph/inode.c
> > @@ -512,6 +512,8 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
> >
> >       ceph_fscache_inode_init(ci);
> >
> > +     ci->i_meta_err = 0;
> > +
> >       return &ci->vfs_inode;
> >  }
> >
> > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> > index c0a15e723f11..f2be9c74c3ae 100644
> > --- a/fs/ceph/mds_client.c
> > +++ b/fs/ceph/mds_client.c
> > @@ -1264,6 +1264,7 @@ static void cleanup_session_requests(struct ceph_mds_client *mdsc,
> >  {
> >       struct ceph_mds_request *req;
> >       struct rb_node *p;
> > +     struct ceph_inode_info *ci;
> >
> >       dout("cleanup_session_requests mds%d\n", session->s_mds);
> >       mutex_lock(&mdsc->mutex);
> > @@ -1272,6 +1273,14 @@ static void cleanup_session_requests(struct ceph_mds_client *mdsc,
> >                                      struct ceph_mds_request, r_unsafe_item);
> >               pr_warn_ratelimited(" dropping unsafe request %llu\n",
> >                                   req->r_tid);
> > +             if (req->r_target_inode) {
> > +                     ci = ceph_inode(req->r_target_inode);
> > +                     errseq_set(&ci->i_meta_err, -EIO);
> > +             }
> > +             if (req->r_unsafe_dir) {
> > +                     ci = ceph_inode(req->r_unsafe_dir);
> > +                     errseq_set(&ci->i_meta_err, -EIO);
> > +             }
>
> Do we really want to set this on both inodes?

For operations that create new file/dir,  r_target_inode is new inode,
 r_unsafe_dir is parent inode. I think fsync on either parent inode or
the new inode should ensure the new file/dir is
persistent. So I set error on both inode.

>
> When we talk about async metadata operations here, we're really talking
> about operations that change a directory's namespace. When those fail,
> it's somewhat analogous to the situation on a blockdev-based filesystem
> when writeback of an in-memory directory fails.
>

why only namespace operations? In my opinion, marking caps dirty is
async, unsafe request is partial async.

> To quote the fsync(2) manpage:
>
>        Calling  fsync() does not necessarily ensure that the
>        entry in the directory containing the file  has  also
>        reached disk.  For that an explicit fsync() on a file
>        descriptor for the directory is also needed.
>
> So if I'm doing namespace operations in a directory on a local fs, and
> issue an fsync on that dir that fails, then I will probably need to
> assume that previous namespace operations may have failed.
>
> It's unlikely though that that filesystem would record a writeback error
> in the dentry->d_inode that was the target of (e.g.) an unlink in that
> case.

r_target_inode is null for unsafe unlink.  r_target_inode is non-null
if unsafe replay contains a positive dentry trace.

>
> That's the behavior I think we'd want to shoot for here, so I'd just
> record the error in the r_unsafe_dir inode.

unsafe setattr only has r_target_inode,  it has null r_unsafe_dir. how
to handle it?

>
> >               __unregister_request(mdsc, req);
> >       }
> >       /* zero r_attempts, so kick_requests() will re-send requests */
> > @@ -1364,7 +1373,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
> >       struct ceph_fs_client *fsc = (struct ceph_fs_client *)arg;
> >       struct ceph_inode_info *ci = ceph_inode(inode);
> >       LIST_HEAD(to_remove);
> > -     bool drop = false;
> > +     bool dirty_dropped = false;
> >       bool invalidate = false;
> >
> >       dout("removing cap %p, ci is %p, inode is %p\n",
> > @@ -1402,7 +1411,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
> >                               inode, ceph_ino(inode));
> >                       ci->i_dirty_caps = 0;
> >                       list_del_init(&ci->i_dirty_item);
> > -                     drop = true;
> > +                     dirty_dropped = true;
> >               }
> >               if (!list_empty(&ci->i_flushing_item)) {
> >                       pr_warn_ratelimited(
> > @@ -1412,10 +1421,22 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
> >                       ci->i_flushing_caps = 0;
> >                       list_del_init(&ci->i_flushing_item);
> >                       mdsc->num_cap_flushing--;
> > -                     drop = true;
> > +                     dirty_dropped = true;
> >               }
> >               spin_unlock(&mdsc->cap_dirty_lock);
> >
> > +             if (dirty_dropped) {
> > +                     errseq_set(&ci->i_meta_err, -EIO);
> > +
> > +                     if (ci->i_wrbuffer_ref_head == 0 &&
> > +                         ci->i_wr_ref == 0 &&
> > +                         ci->i_dirty_caps == 0 &&
> > +                         ci->i_flushing_caps == 0) {
> > +                             ceph_put_snap_context(ci->i_head_snapc);
> > +                             ci->i_head_snapc = NULL;
> > +                     }
> > +             }
> > +
> >               if (atomic_read(&ci->i_filelock_ref) > 0) {
> >                       /* make further file lock syscall return -EIO */
> >                       ci->i_ceph_flags |= CEPH_I_ERROR_FILELOCK;
> > @@ -1427,15 +1448,6 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
> >                       list_add(&ci->i_prealloc_cap_flush->i_list, &to_remove);
> >                       ci->i_prealloc_cap_flush = NULL;
> >               }
> > -
> > -               if (drop &&
> > -                  ci->i_wrbuffer_ref_head == 0 &&
> > -                  ci->i_wr_ref == 0 &&
> > -                  ci->i_dirty_caps == 0 &&
> > -                  ci->i_flushing_caps == 0) {
> > -                      ceph_put_snap_context(ci->i_head_snapc);
> > -                      ci->i_head_snapc = NULL;
> > -               }
> >       }
> >       spin_unlock(&ci->i_ceph_lock);
> >       while (!list_empty(&to_remove)) {
> > @@ -1449,7 +1461,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
> >       wake_up_all(&ci->i_cap_wq);
> >       if (invalidate)
> >               ceph_queue_invalidate(inode);
> > -     if (drop)
> > +     if (dirty_dropped)
> >               iput(inode);
> >       return 0;
> >  }
> > diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> > index 98d2bafc2ee2..2e516d47052f 100644
> > --- a/fs/ceph/super.h
> > +++ b/fs/ceph/super.h
> > @@ -393,6 +393,8 @@ struct ceph_inode_info {
> >       struct fscache_cookie *fscache;
> >       u32 i_fscache_gen;
> >  #endif
> > +     errseq_t i_meta_err;
> > +
> >       struct inode vfs_inode; /* at end */
> >  };
> >
> > @@ -701,6 +703,8 @@ struct ceph_file_info {
> >
> >       spinlock_t rw_contexts_lock;
> >       struct list_head rw_contexts;
> > +
> > +     errseq_t meta_err;
> >  };
> >
> >  struct ceph_dir_file_info {
>
>
>