On Tue, 2017-12-05 at 09:51 +0800, Yan, Zheng wrote: > Readdir cache keeps array of dentry pointers in page cache. If any > dentry in readdir cache gets pruned, ceph_d_prune() disables readdir > cache for later readdir syscall. The problem is that ceph_d_prune() > ignores unhashed dentry. Ideally MDS should have already revoked > CEPH_CAP_FILE_SHARED (which also disables readdir cache) when dentry > gets unhashed. But if it is somehow MDS does not properly revoke > CEPH_CAP_FILE_SHARED and the unhashed dentry gets pruned later, > ceph_d_prune() will not disable readdir cache, later readdir may > reference invalid dentry pointer. > > The fix is make ceph_d_prune() do extra check for unhashed dentry. > Disable readdir cache if the unhashed dentry is still referenced > by readdir cache. > > Another fix in this patch is handle d_splice_alias(). If a dentry > gets spliced into new parent dentry, treat it as if it was pruned > (call ceph_d_prune() for it). > > Signed-off-by: "Yan, Zheng" <zyan@xxxxxxxxxx> > --- > fs/ceph/dir.c | 46 +++++++++++++++++++++++++++++++++------------- > fs/ceph/inode.c | 40 ++++++++++++++++++++++++++++++++++------ > 2 files changed, 67 insertions(+), 19 deletions(-) > > diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c > index 0a23a4d9fde8..793306f5352c 100644 > --- a/fs/ceph/dir.c > +++ b/fs/ceph/dir.c > @@ -231,11 +231,17 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx, > goto out; > } > > - di = ceph_dentry(dentry); > spin_lock(&dentry->d_lock); > - if (di->lease_shared_gen == shared_gen && > - d_really_is_positive(dentry) && > - fpos_cmp(ctx->pos, di->offset) <= 0) { > + di = ceph_dentry(dentry); > + if (d_unhashed(dentry) || > + d_really_is_negative(dentry) || > + di->lease_shared_gen != shared_gen) { > + spin_unlock(&dentry->d_lock); > + dput(dentry); > + err = -EAGAIN; > + goto out; > + } > + if (fpos_cmp(ctx->pos, di->offset) <= 0) { > emit_dentry = true; > } > spin_unlock(&dentry->d_lock); > @@ -1324,24 +1330,38 @@ static void ceph_d_release(struct dentry *dentry) > */ > static void ceph_d_prune(struct dentry *dentry) > { > - dout("ceph_d_prune %p\n", dentry); > + struct inode *dir; > + struct ceph_dentry_info *di; > + > + dout("ceph_d_prune %pd %p\n", dentry, dentry); > > /* do we have a valid parent? */ > if (IS_ROOT(dentry)) > return; > > - /* if we are not hashed, we don't affect dir's completeness */ > - if (d_unhashed(dentry)) > + /* we hold d_lock, so d_parent is stable */ > + dir = d_inode(dentry->d_parent); > + if (ceph_snap(dir) == CEPH_SNAPDIR) > return; > > - if (ceph_snap(d_inode(dentry->d_parent)) == CEPH_SNAPDIR) > + /* who calls d_delete() should also disable dcache readdir */ > + if (d_really_is_negative(dentry)) > return; > > - /* > - * we hold d_lock, so d_parent is stable, and d_fsdata is never > - * cleared until d_release > - */ > - ceph_dir_clear_complete(d_inode(dentry->d_parent)); > + /* d_fsdata does not get cleared until d_release */ > + if (!d_unhashed(dentry)) { > + ceph_dir_clear_complete(dir); > + return; > + } > + > + /* Disable dcache readdir just in case that someone called d_drop() > + * or d_invalidate(), but MDS didn't revoke CEPH_CAP_FILE_SHARED > + * properly (dcache readdir is still enabled) */ > + di = ceph_dentry(dentry); > + if (di->offset > 0 && > + di->lease_shared_gen == > + atomic64_read(&ceph_inode(dir)->i_shared_gen)) > + ceph_dir_clear_ordered(dir); > } > > /* > diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c > index a9de83f012bf..01e5b460efb8 100644 > --- a/fs/ceph/inode.c > +++ b/fs/ceph/inode.c > @@ -1080,6 +1080,27 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in) > > BUG_ON(d_inode(dn)); > > + if (S_ISDIR(in->i_mode)) { > + /* If inode is directory, d_splice_alias() below will remove > + * 'realdn' from its origin parent. We need to ensure that > + * origin parent's readdir cache will not reference 'realdn' > + */ > + realdn = d_find_any_alias(in); > + if (realdn) { > + struct ceph_dentry_info *di = ceph_dentry(realdn); > + spin_lock(&realdn->d_lock); > + > + realdn->d_op->d_prune(realdn); > + > + di->time = jiffies; > + di->lease_shared_gen = 0; > + di->offset = 0; > + > + spin_unlock(&realdn->d_lock); > + dput(realdn); > + } > + } > + > /* dn must be unhashed */ > if (!d_unhashed(dn)) > d_drop(dn); > @@ -1295,8 +1316,8 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req) > if (!rinfo->head->is_target) { > dout("fill_trace null dentry\n"); > if (d_really_is_positive(dn)) { > - ceph_dir_clear_ordered(dir); > dout("d_delete %p\n", dn); > + ceph_dir_clear_ordered(dir); > d_delete(dn); > } else if (have_lease) { > if (d_unhashed(dn)) > @@ -1323,7 +1344,6 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req) > dout(" %p links to %p %llx.%llx, not %llx.%llx\n", > dn, d_inode(dn), ceph_vinop(d_inode(dn)), > ceph_vinop(in)); > - ceph_dir_clear_ordered(dir); > d_invalidate(dn); > have_lease = false; > } > @@ -1573,9 +1593,19 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, > } else if (d_really_is_positive(dn) && > (ceph_ino(d_inode(dn)) != tvino.ino || > ceph_snap(d_inode(dn)) != tvino.snap)) { > + struct ceph_dentry_info *di = ceph_dentry(dn); > dout(" dn %p points to wrong inode %p\n", > dn, d_inode(dn)); > - __ceph_dir_clear_ordered(ci); > + > + spin_lock(&dn->d_lock); > + if (di->offset > 0 && > + di->lease_shared_gen == > + atomic64_read(&ci->i_shared_gen)) { > + __ceph_dir_clear_ordered(ci); > + di->offset = 0; > + } > + spin_unlock(&dn->d_lock); > + > d_delete(dn); > dput(dn); > goto retry_lookup; > @@ -1600,9 +1630,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, > &req->r_caps_reservation); > if (ret < 0) { > pr_err("fill_inode badness on %p\n", in); > - if (d_really_is_positive(dn)) > - __ceph_dir_clear_ordered(ci); > - else > + if (d_really_is_negative(dn)) > iput(in); > d_drop(dn); > err = ret; FWIW, I really hate how ceph keeps pointers to dentries without references and relies on all of this skulduggery to keep from dereferencing bogus ones. It all seems so dangerous. That said, I don't see anything wrong with this patch right offhand. Acked-by: Jeff Layton <jlayton@xxxxxxxxxx> -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html