On Tue, 2020-08-11 at 15:23 +0800, Yan, Zheng wrote: > Since nautilus, MDS tracks dirfrags whose child inodes have caps in open > file table. When MDS recovers, it prefetches all of these dirfrags. This > avoids using backtrace to load inodes. But dirfrags prefetch may load > lots of useless inodes into cache, and make MDS run out of memory. > > Recent MDS adds an option that disables dirfrags prefetch. When dirfrags > prefetch is disabled, the recovering MDS only prefetches corresponding dir > inodes. Including inodes' parent/d_name in cap reconnect message can > help MDS to load inodes into its cache. > > Signed-off-by: "Yan, Zheng" <zyan@xxxxxxxxxx> > --- > fs/ceph/mds_client.c | 89 ++++++++++++++++++++++++++++++-------------- > 1 file changed, 61 insertions(+), 28 deletions(-) > > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c > index 9a09d12569bd..4eaed12b4b4c 100644 > --- a/fs/ceph/mds_client.c > +++ b/fs/ceph/mds_client.c > @@ -3553,6 +3553,39 @@ static int send_reconnect_partial(struct ceph_reconnect_state *recon_state) > return err; > } > > +static struct dentry* d_find_primary(struct inode *inode) > +{ > + struct dentry *alias, *dn = NULL; > + > + if (hlist_empty(&inode->i_dentry)) > + return NULL; > + > + spin_lock(&inode->i_lock); > + if (hlist_empty(&inode->i_dentry)) > + goto out_unlock; > + > + if (S_ISDIR(inode->i_mode)) { > + alias = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias); > + if (!IS_ROOT(alias)) > + dn = dget(alias); > + goto out_unlock; > + } > + > + hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) { > + spin_lock(&alias->d_lock); > + if (!d_unhashed(alias) && > + (ceph_dentry(alias)->flags & CEPH_DENTRY_PRIMARY_LINK)) { > + dn = dget_dlock(alias); > + } > + spin_unlock(&alias->d_lock); > + if (dn) > + break; > + } > +out_unlock: > + spin_unlock(&inode->i_lock); > + return dn; > +} > + > /* > * Encode information about a cap for a reconnect with the MDS. 
> */ > @@ -3566,13 +3599,32 @@ static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap, > struct ceph_inode_info *ci = cap->ci; > struct ceph_reconnect_state *recon_state = arg; > struct ceph_pagelist *pagelist = recon_state->pagelist; > - int err; > + struct dentry *dentry; > + char *path; > + int pathlen, err; > + u64 pathbase; > u64 snap_follows; > > dout(" adding %p ino %llx.%llx cap %p %lld %s\n", > inode, ceph_vinop(inode), cap, cap->cap_id, > ceph_cap_string(cap->issued)); > > + dentry = d_find_primary(inode); > + if (dentry) { > + /* set pathbase to parent dir when msg_version >= 2 */ > + path = ceph_mdsc_build_path(dentry, &pathlen, &pathbase, > + recon_state->msg_version >= 2); One question: Do we really need to build a full path back to the root for the msg_version == 1 case? I notice that the v1 message has a field for the pathbase, which would seem to make the full path unnecessary. Is there some quirk in older MDS versions that requires a full path for this? > + dput(dentry); > + if (IS_ERR(path)) { > + err = PTR_ERR(path); > + goto out_err; > + } > + } else { > + path = NULL; > + pathlen = 0; > + pathbase = 0; > + } > + > spin_lock(&ci->i_ceph_lock); > cap->seq = 0; /* reset cap seq */ > cap->issue_seq = 0; /* and issue_seq */ > @@ -3593,7 +3645,7 @@ static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap, > rec.v2.wanted = cpu_to_le32(__ceph_caps_wanted(ci)); > rec.v2.issued = cpu_to_le32(cap->issued); > rec.v2.snaprealm = cpu_to_le64(ci->i_snap_realm->ino); > - rec.v2.pathbase = 0; > + rec.v2.pathbase = cpu_to_le64(pathbase); > rec.v2.flock_len = (__force __le32) > ((ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) ? 
0 : 1); > } else { > @@ -3604,7 +3656,7 @@ static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap, > ceph_encode_timespec64(&rec.v1.mtime, &inode->i_mtime); > ceph_encode_timespec64(&rec.v1.atime, &inode->i_atime); > rec.v1.snaprealm = cpu_to_le64(ci->i_snap_realm->ino); > - rec.v1.pathbase = 0; > + rec.v1.pathbase = cpu_to_le64(pathbase); > } > > if (list_empty(&ci->i_cap_snaps)) { > @@ -3666,7 +3718,7 @@ static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap, > sizeof(struct ceph_filelock); > rec.v2.flock_len = cpu_to_le32(struct_len); > > - struct_len += sizeof(u32) + sizeof(rec.v2); > + struct_len += sizeof(u32) + pathlen + sizeof(rec.v2); > > if (struct_v >= 2) > struct_len += sizeof(u64); /* snap_follows */ > @@ -3690,7 +3742,7 @@ static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap, > ceph_pagelist_encode_8(pagelist, 1); > ceph_pagelist_encode_32(pagelist, struct_len); > } > - ceph_pagelist_encode_string(pagelist, NULL, 0); > + ceph_pagelist_encode_string(pagelist, path, pathlen); > ceph_pagelist_append(pagelist, &rec, sizeof(rec.v2)); > ceph_locks_to_pagelist(flocks, pagelist, > num_fcntl_locks, num_flock_locks); > @@ -3699,39 +3751,20 @@ static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap, > out_freeflocks: > kfree(flocks); > } else { > - u64 pathbase = 0; > - int pathlen = 0; > - char *path = NULL; > - struct dentry *dentry; > - > - dentry = d_find_alias(inode); > - if (dentry) { > - path = ceph_mdsc_build_path(dentry, > - &pathlen, &pathbase, 0); > - dput(dentry); > - if (IS_ERR(path)) { > - err = PTR_ERR(path); > - goto out_err; > - } > - rec.v1.pathbase = cpu_to_le64(pathbase); > - } > - > err = ceph_pagelist_reserve(pagelist, > sizeof(u64) + sizeof(u32) + > pathlen + sizeof(rec.v1)); > - if (err) { > - goto out_freepath; > - } > + if (err) > + goto out_err; > > ceph_pagelist_encode_64(pagelist, ceph_ino(inode)); > ceph_pagelist_encode_string(pagelist, path, pathlen); > 
ceph_pagelist_append(pagelist, &rec, sizeof(rec.v1)); > -out_freepath: > - ceph_mdsc_free_path(path, pathlen); > } > > out_err: > - if (err >= 0) > + ceph_mdsc_free_path(path, pathlen); > + if (!err) > recon_state->nr_caps++; > return err; > } -- Jeff Layton <jlayton@xxxxxxxxxx>