> On 5 Dec 2017, at 19:31, Jeff Layton <jlayton@xxxxxxxxxx> wrote: > > On Tue, 2017-12-05 at 09:51 +0800, Yan, Zheng wrote: >> Readdir cache keeps array of dentry pointers in page cache. If any >> dentry in readdir cache gets pruned, ceph_d_prune() disables readdir >> cache for later readdir syscall. The problem is that ceph_d_prune() >> ignores unhashed dentry. Ideally MDS should have already revoked >> CEPH_CAP_FILE_SHARED (which also disables readdir cache) when dentry >> gets unhashed. But if it is somehow MDS does not properly revoke >> CEPH_CAP_FILE_SHARED and the unhashed dentry gets pruned later, >> ceph_d_prune() will not disable readdir cache, later readdir may >> reference invalid dentry pointer. >> >> The fix is make ceph_d_prune() do extra check for unhashed dentry. >> Disable readdir cache if the unhashed dentry is still referenced >> by readdir cache. >> >> Another fix in this patch is handle d_splice_alias(). If a dentry >> gets spliced into new parent dentry, treat it as if it was pruned >> (call ceph_d_prune() for it). >> >> Signed-off-by: "Yan, Zheng" <zyan@xxxxxxxxxx> >> --- >> fs/ceph/dir.c | 46 +++++++++++++++++++++++++++++++++------------- >> fs/ceph/inode.c | 40 ++++++++++++++++++++++++++++++++++------ >> 2 files changed, 67 insertions(+), 19 deletions(-) >> >> diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c >> index 0a23a4d9fde8..793306f5352c 100644 >> --- a/fs/ceph/dir.c >> +++ b/fs/ceph/dir.c >> @@ -231,11 +231,17 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx, >> goto out; >> } >> >> - di = ceph_dentry(dentry); >> spin_lock(&dentry->d_lock); >> - if (di->lease_shared_gen == shared_gen && >> - d_really_is_positive(dentry) && >> - fpos_cmp(ctx->pos, di->offset) <= 0) { >> + di = ceph_dentry(dentry); >> + if (d_unhashed(dentry) || >> + d_really_is_negative(dentry) || >> + di->lease_shared_gen != shared_gen) { >> + spin_unlock(&dentry->d_lock); >> + dput(dentry); >> + err = -EAGAIN; >> + goto out; >> + } >> + if (fpos_cmp(ctx->pos, di->offset) <= 0) { >> emit_dentry = true; >> } >> spin_unlock(&dentry->d_lock); >> @@ -1324,24 +1330,38 @@ static void ceph_d_release(struct dentry *dentry) >> */ >> static void ceph_d_prune(struct dentry *dentry) >> { >> - dout("ceph_d_prune %p\n", dentry); >> + struct inode *dir; >> + struct ceph_dentry_info *di; >> + >> + dout("ceph_d_prune %pd %p\n", dentry, dentry); >> >> /* do we have a valid parent? */ >> if (IS_ROOT(dentry)) >> return; >> >> - /* if we are not hashed, we don't affect dir's completeness */ >> - if (d_unhashed(dentry)) >> + /* we hold d_lock, so d_parent is stable */ >> + dir = d_inode(dentry->d_parent); >> + if (ceph_snap(dir) == CEPH_SNAPDIR) >> return; >> >> - if (ceph_snap(d_inode(dentry->d_parent)) == CEPH_SNAPDIR) >> + /* who calls d_delete() should also disable dcache readdir */ >> + if (d_really_is_negative(dentry)) >> return; >> >> - /* >> - * we hold d_lock, so d_parent is stable, and d_fsdata is never >> - * cleared until d_release >> - */ >> - ceph_dir_clear_complete(d_inode(dentry->d_parent)); >> + /* d_fsdata does not get cleared until d_release */ >> + if (!d_unhashed(dentry)) { >> + ceph_dir_clear_complete(dir); >> + return; >> + } >> + >> + /* Disable dcache readdir just in case that someone called d_drop() >> + * or d_invalidate(), but MDS didn't revoke CEPH_CAP_FILE_SHARED >> + * properly (dcache readdir is still enabled) */ >> + di = ceph_dentry(dentry); >> + if (di->offset > 0 && >> + di->lease_shared_gen == >> + atomic64_read(&ceph_inode(dir)->i_shared_gen)) >> + ceph_dir_clear_ordered(dir); >> } >> >> /* >> diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c >> index a9de83f012bf..01e5b460efb8 100644 >> --- a/fs/ceph/inode.c >> +++ b/fs/ceph/inode.c >> @@ -1080,6 +1080,27 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in) >> >> BUG_ON(d_inode(dn)); >> >> + if (S_ISDIR(in->i_mode)) { >> + /* If inode is directory, d_splice_alias() below will remove >> + * 'realdn' from its origin parent. We need to ensure that >> + * origin parent's readdir cache will not reference 'realdn' >> + */ >> + realdn = d_find_any_alias(in); >> + if (realdn) { >> + struct ceph_dentry_info *di = ceph_dentry(realdn); >> + spin_lock(&realdn->d_lock); >> + >> + realdn->d_op->d_prune(realdn); >> + >> + di->time = jiffies; >> + di->lease_shared_gen = 0; >> + di->offset = 0; >> + >> + spin_unlock(&realdn->d_lock); >> + dput(realdn); >> + } >> + } >> + >> /* dn must be unhashed */ >> if (!d_unhashed(dn)) >> d_drop(dn); >> @@ -1295,8 +1316,8 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req) >> if (!rinfo->head->is_target) { >> dout("fill_trace null dentry\n"); >> if (d_really_is_positive(dn)) { >> - ceph_dir_clear_ordered(dir); >> dout("d_delete %p\n", dn); >> + ceph_dir_clear_ordered(dir); >> d_delete(dn); >> } else if (have_lease) { >> if (d_unhashed(dn)) >> @@ -1323,7 +1344,6 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req) >> dout(" %p links to %p %llx.%llx, not %llx.%llx\n", >> dn, d_inode(dn), ceph_vinop(d_inode(dn)), >> ceph_vinop(in)); >> - ceph_dir_clear_ordered(dir); >> d_invalidate(dn); >> have_lease = false; >> } >> @@ -1573,9 +1593,19 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, >> } else if (d_really_is_positive(dn) && >> (ceph_ino(d_inode(dn)) != tvino.ino || >> ceph_snap(d_inode(dn)) != tvino.snap)) { >> + struct ceph_dentry_info *di = ceph_dentry(dn); >> dout(" dn %p points to wrong inode %p\n", >> dn, d_inode(dn)); >> - __ceph_dir_clear_ordered(ci); >> + >> + spin_lock(&dn->d_lock); >> + if (di->offset > 0 && >> + di->lease_shared_gen == >> + atomic64_read(&ci->i_shared_gen)) { >> + __ceph_dir_clear_ordered(ci); >> + di->offset = 0; >> + } >> + spin_unlock(&dn->d_lock); >> + >> d_delete(dn); >> dput(dn); >> goto retry_lookup; >> @@ -1600,9 +1630,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, >> &req->r_caps_reservation); >> if (ret < 0) { >> pr_err("fill_inode badness on %p\n", in); >> - if (d_really_is_positive(dn)) >> - __ceph_dir_clear_ordered(ci); >> - else >> + if (d_really_is_negative(dn)) >> iput(in); >> d_drop(dn); >> err = ret; > > > FWIW, I really hate how ceph keeps pointers to dentries without > references and relies on all of this skulduggery to keep from > dereferencing bogus ones. It all seems so dangerous. I don’t like this neither. It has caused so many trouble. > > That said, I don't see anything wrong with this patch right offhand. > > Acked-by: Jeff Layton <jlayton@xxxxxxxxxx> -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html