Re: [PATCH] ceph: encode inodes' parent/d_name in cap reconnect message

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Tue, Aug 11, 2020 at 7:31 PM Jeff Layton <jlayton@xxxxxxxxxx> wrote:
>
> On Tue, 2020-08-11 at 15:23 +0800, Yan, Zheng wrote:
> > Since nautilus, MDS tracks dirfrags whose child inodes have caps in open
> > file table. When MDS recovers, it prefetches all of these dirfrags. This
> > avoids using backtrace to load inodes. But dirfrags prefetch may load
> > lots of useless inodes into cache, and make MDS run out of memory.
> >
> > Recent MDS adds an option that disables dirfrags prefetch. When dirfrags
> > prefetch is disabled. Recovering MDS only prefetches corresponding dir
> > inodes. Including inodes' parent/d_name in cap reconnect message can
> > help MDS to load inodes into its cache.
> >
> > Signed-off-by: "Yan, Zheng" <zyan@xxxxxxxxxx>
> > ---
> >  fs/ceph/mds_client.c | 89 ++++++++++++++++++++++++++++++--------------
> >  1 file changed, 61 insertions(+), 28 deletions(-)
> >
> > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> > index 9a09d12569bd..4eaed12b4b4c 100644
> > --- a/fs/ceph/mds_client.c
> > +++ b/fs/ceph/mds_client.c
> > @@ -3553,6 +3553,39 @@ static int send_reconnect_partial(struct ceph_reconnect_state *recon_state)
> >       return err;
> >  }
> >
> > +static struct dentry* d_find_primary(struct inode *inode)
> > +{
> > +     struct dentry *alias, *dn = NULL;
> > +
> > +     if (hlist_empty(&inode->i_dentry))
> > +             return NULL;
> > +
> > +     spin_lock(&inode->i_lock);
> > +     if (hlist_empty(&inode->i_dentry))
> > +             goto out_unlock;
> > +
> > +     if (S_ISDIR(inode->i_mode)) {
> > +             alias = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias);
> > +             if (!IS_ROOT(alias))
> > +                     dn = dget(alias);
> > +             goto out_unlock;
> > +     }
> > +
> > +     hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) {
> > +             spin_lock(&alias->d_lock);
> > +             if (!d_unhashed(alias) &&
> > +                 (ceph_dentry(alias)->flags & CEPH_DENTRY_PRIMARY_LINK)) {
> > +                     dn = dget_dlock(alias);
> > +             }
> > +             spin_unlock(&alias->d_lock);
> > +             if (dn)
> > +                     break;
> > +     }
> > +out_unlock:
> > +     spin_unlock(&inode->i_lock);
> > +     return dn;
> > +}
> > +
> >  /*
> >   * Encode information about a cap for a reconnect with the MDS.
> >   */
> > @@ -3566,13 +3599,32 @@ static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap,
> >       struct ceph_inode_info *ci = cap->ci;
> >       struct ceph_reconnect_state *recon_state = arg;
> >       struct ceph_pagelist *pagelist = recon_state->pagelist;
> > -     int err;
> > +     struct dentry *dentry;
> > +     char *path;
> > +     int pathlen, err;
> > +     u64 pathbase;
> >       u64 snap_follows;
> >
> >       dout(" adding %p ino %llx.%llx cap %p %lld %s\n",
> >            inode, ceph_vinop(inode), cap, cap->cap_id,
> >            ceph_cap_string(cap->issued));
> >
> > +     dentry = d_find_primary(inode);
> > +     if (dentry) {
> > +             /* set pathbase to parent dir when msg_version >= 2 */
> > +             path = ceph_mdsc_build_path(dentry, &pathlen, &pathbase,
> > +                                         recon_state->msg_version >= 2);
>
> One question:
>
> Do we really need to build a full path back to the root for the
> msg_version == 1 case? I notice that the v1 message has a field for the
> pathbase, which would seem to make the full path unnecessary. Is there
> some quirk in older MDS versions that requires a full path for this?
>

emperor and older mds require this. I guess no one uses mds that old.
So  it's OK to always build relative path.

Regards
Yan, Zheng

>
> > +             dput(dentry);
> > +             if (IS_ERR(path)) {
> > +                     err = PTR_ERR(path);
> > +                     goto out_err;
> > +             }
> > +     } else {
> > +             path = NULL;
> > +             pathlen = 0;
> > +             pathbase = 0;
> > +     }
> > +
> >       spin_lock(&ci->i_ceph_lock);
> >       cap->seq = 0;        /* reset cap seq */
> >       cap->issue_seq = 0;  /* and issue_seq */
> > @@ -3593,7 +3645,7 @@ static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap,
> >               rec.v2.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
> >               rec.v2.issued = cpu_to_le32(cap->issued);
> >               rec.v2.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
> > -             rec.v2.pathbase = 0;
> > +             rec.v2.pathbase = cpu_to_le64(pathbase);
> >               rec.v2.flock_len = (__force __le32)
> >                       ((ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) ? 0 : 1);
> >       } else {
> > @@ -3604,7 +3656,7 @@ static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap,
> >               ceph_encode_timespec64(&rec.v1.mtime, &inode->i_mtime);
> >               ceph_encode_timespec64(&rec.v1.atime, &inode->i_atime);
> >               rec.v1.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
> > -             rec.v1.pathbase = 0;
> > +             rec.v1.pathbase = cpu_to_le64(pathbase);
> >       }
> >
> >       if (list_empty(&ci->i_cap_snaps)) {
> > @@ -3666,7 +3718,7 @@ static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap,
> >                           sizeof(struct ceph_filelock);
> >               rec.v2.flock_len = cpu_to_le32(struct_len);
> >
> > -             struct_len += sizeof(u32) + sizeof(rec.v2);
> > +             struct_len += sizeof(u32) + pathlen + sizeof(rec.v2);
> >
> >               if (struct_v >= 2)
> >                       struct_len += sizeof(u64); /* snap_follows */
> > @@ -3690,7 +3742,7 @@ static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap,
> >                       ceph_pagelist_encode_8(pagelist, 1);
> >                       ceph_pagelist_encode_32(pagelist, struct_len);
> >               }
> > -             ceph_pagelist_encode_string(pagelist, NULL, 0);
> > +             ceph_pagelist_encode_string(pagelist, path, pathlen);
> >               ceph_pagelist_append(pagelist, &rec, sizeof(rec.v2));
> >               ceph_locks_to_pagelist(flocks, pagelist,
> >                                      num_fcntl_locks, num_flock_locks);
> > @@ -3699,39 +3751,20 @@ static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap,
> >  out_freeflocks:
> >               kfree(flocks);
> >       } else {
> > -             u64 pathbase = 0;
> > -             int pathlen = 0;
> > -             char *path = NULL;
> > -             struct dentry *dentry;
> > -
> > -             dentry = d_find_alias(inode);
> > -             if (dentry) {
> > -                     path = ceph_mdsc_build_path(dentry,
> > -                                             &pathlen, &pathbase, 0);
> > -                     dput(dentry);
> > -                     if (IS_ERR(path)) {
> > -                             err = PTR_ERR(path);
> > -                             goto out_err;
> > -                     }
> > -                     rec.v1.pathbase = cpu_to_le64(pathbase);
> > -             }
> > -
> >               err = ceph_pagelist_reserve(pagelist,
> >                                           sizeof(u64) + sizeof(u32) +
> >                                           pathlen + sizeof(rec.v1));
> > -             if (err) {
> > -                     goto out_freepath;
> > -             }
> > +             if (err)
> > +                     goto out_err;
> >
> >               ceph_pagelist_encode_64(pagelist, ceph_ino(inode));
> >               ceph_pagelist_encode_string(pagelist, path, pathlen);
> >               ceph_pagelist_append(pagelist, &rec, sizeof(rec.v1));
> > -out_freepath:
> > -             ceph_mdsc_free_path(path, pathlen);
> >       }
> >
> >  out_err:
> > -     if (err >= 0)
> > +     ceph_mdsc_free_path(path, pathlen);
> > +     if (!err)
> >               recon_state->nr_caps++;
> >       return err;
> >  }
>
> --
> Jeff Layton <jlayton@xxxxxxxxxx>
>



[Index of Archives]     [CEPH Users]     [Ceph Large]     [Ceph Dev]     [Information on CEPH]     [Linux BTRFS]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux