Re: [PATCH v4 2/2] ceph: wait the first reply of inflight async unlink

Jeff Layton <jlayton@xxxxxxxxxx> · Wed, 18 May 2022 20:26:18 -0400

On Thu, 2022-05-19 at 08:22 +0800, Xiubo Li wrote:
> On 5/19/22 4:28 AM, Jeff Layton wrote:
> > On Wed, 2022-05-18 at 22:45 +0800, Xiubo Li wrote:
> > > In async unlink case the kclient won't wait for the first reply
> > > from MDS and just drop all the links and unhash the dentry and then
> > > succeeds immediately.
> > > 
> > > For any new create/link/rename,etc requests followed by using the
> > > same file names we must wait for the first reply of the inflight
> > > unlink request, or the MDS possibly will fail these following
> > > requests with -EEXIST if the inflight async unlink request was
> > > delayed for some reasons.
> > > 
> > > And the worst case is that for the none async openc request it will
> > > successfully open the file if the CDentry hasn't been unlinked yet,
> > > but later the previous delayed async unlink request will remove the
> > > CDenty. That means the just created file is possiblly deleted later
> > > by accident.
> > > 
> > > We need to wait for the inflight async unlink requests to finish
> > > when creating new files/directories by using the same file names.
> > > 
> > > URL: https://tracker.ceph.com/issues/55332
> > > Reported-by: kernel test robot <lkp@xxxxxxxxx>
> > > Signed-off-by: Xiubo Li <xiubli@xxxxxxxxxx>
> > > ---
> > >   fs/ceph/dir.c        | 70 +++++++++++++++++++++++++++++++++++++++---
> > >   fs/ceph/file.c       |  4 +++
> > >   fs/ceph/mds_client.c | 73 ++++++++++++++++++++++++++++++++++++++++++++
> > >   fs/ceph/mds_client.h |  1 +
> > >   fs/ceph/super.c      |  3 ++
> > >   fs/ceph/super.h      | 19 +++++++++---
> > >   6 files changed, 160 insertions(+), 10 deletions(-)
> > > 
> > > diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
> > > index eae417d71136..01e7facef9b2 100644
> > > --- a/fs/ceph/dir.c
> > > +++ b/fs/ceph/dir.c
> > > @@ -856,6 +856,10 @@ static int ceph_mknod(struct user_namespace *mnt_userns, struct inode *dir,
> > >   	if (ceph_snap(dir) != CEPH_NOSNAP)
> > >   		return -EROFS;
> > >   
> > > +	err = ceph_wait_on_conflict_unlink(dentry);
> > > +	if (err)
> > > +		return err;
> > > +
> > >   	if (ceph_quota_is_max_files_exceeded(dir)) {
> > >   		err = -EDQUOT;
> > >   		goto out;
> > > @@ -918,6 +922,10 @@ static int ceph_symlink(struct user_namespace *mnt_userns, struct inode *dir,
> > >   	if (ceph_snap(dir) != CEPH_NOSNAP)
> > >   		return -EROFS;
> > >   
> > > +	err = ceph_wait_on_conflict_unlink(dentry);
> > > +	if (err)
> > > +		return err;
> > > +
> > >   	if (ceph_quota_is_max_files_exceeded(dir)) {
> > >   		err = -EDQUOT;
> > >   		goto out;
> > > @@ -968,9 +976,13 @@ static int ceph_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
> > >   	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
> > >   	struct ceph_mds_request *req;
> > >   	struct ceph_acl_sec_ctx as_ctx = {};
> > > -	int err = -EROFS;
> > > +	int err;
> > >   	int op;
> > >   
> > > +	err = ceph_wait_on_conflict_unlink(dentry);
> > > +	if (err)
> > > +		return err;
> > > +
> > >   	if (ceph_snap(dir) == CEPH_SNAPDIR) {
> > >   		/* mkdir .snap/foo is a MKSNAP */
> > >   		op = CEPH_MDS_OP_MKSNAP;
> > > @@ -980,6 +992,7 @@ static int ceph_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
> > >   		dout("mkdir dir %p dn %p mode 0%ho\n", dir, dentry, mode);
> > >   		op = CEPH_MDS_OP_MKDIR;
> > >   	} else {
> > > +		err = -EROFS;
> > >   		goto out;
> > >   	}
> > >   
> > > @@ -1037,6 +1050,10 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir,
> > >   	struct ceph_mds_request *req;
> > >   	int err;
> > >   
> > > +	err = ceph_wait_on_conflict_unlink(dentry);
> > > +	if (err)
> > > +		return err;
> > > +
> > >   	if (ceph_snap(dir) != CEPH_NOSNAP)
> > >   		return -EROFS;
> > >   
> > > @@ -1071,9 +1088,27 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir,
> > >   static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc,
> > >   				 struct ceph_mds_request *req)
> > >   {
> > > +	struct dentry *dentry = req->r_dentry;
> > > +	struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
> > > +	struct ceph_dentry_info *di = ceph_dentry(dentry);
> > >   	int result = req->r_err ? req->r_err :
> > >   			le32_to_cpu(req->r_reply_info.head->result);
> > >   
> > > +	if (!test_bit(CEPH_DENTRY_ASYNC_UNLINK_BIT, &di->flags))
> > > +		pr_warn("%s dentry %p:%pd async unlink bit is not set\n",
> > > +			__func__, dentry, dentry);
> > > +
> > This is a pr_warn so it won't have a "ceph: " prefix or anything
> > prepending it like dout messages do. You should probably prepend this
> > with something like that.
> 
> The pr_fmt already help add the module name:
> 
> #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
> 
> Such as for one current log in ceph:
> 
> 2022-05-02T17:28:51.006593+00:00 smithi040 kernel: ceph: ceph: async 
> create failure path=(1)client.0/tmp/ceph/src/os/ObjectStore.h result=-17!
> 
> 

My mistake! Removing those is fine.

> > 
> > > +	spin_lock(&fsc->async_unlink_conflict_lock);
> > > +	hash_del_rcu(&di->hnode);
> > > +	spin_unlock(&fsc->async_unlink_conflict_lock);
> > > +
> > > +	spin_lock(&dentry->d_lock);
> > > +	di->flags &= ~CEPH_DENTRY_ASYNC_UNLINK;
> > > +	wake_up_bit(&di->flags, CEPH_DENTRY_ASYNC_UNLINK_BIT);
> > > +	spin_unlock(&dentry->d_lock);
> > > +
> > > +	synchronize_rcu();
> > > +
> > >   	if (result == -EJUKEBOX)
> > >   		goto out;
> > >   
> > > @@ -1081,7 +1116,7 @@ static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc,
> > >   	if (result) {
> > >   		int pathlen = 0;
> > >   		u64 base = 0;
> > > -		char *path = ceph_mdsc_build_path(req->r_dentry, &pathlen,
> > > +		char *path = ceph_mdsc_build_path(dentry, &pathlen,
> > >   						  &base, 0);
> > >   
> > >   		/* mark error on parent + clear complete */
> > > @@ -1089,13 +1124,13 @@ static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc,
> > >   		ceph_dir_clear_complete(req->r_parent);
> > >   
> > >   		/* drop the dentry -- we don't know its status */
> > > -		if (!d_unhashed(req->r_dentry))
> > > -			d_drop(req->r_dentry);
> > > +		if (!d_unhashed(dentry))
> > > +			d_drop(dentry);
> > >   
> > >   		/* mark inode itself for an error (since metadata is bogus) */
> > >   		mapping_set_error(req->r_old_inode->i_mapping, result);
> > >   
> > > -		pr_warn("ceph: async unlink failure path=(%llx)%s result=%d!\n",
> > > +		pr_warn("async unlink failure path=(%llx)%s result=%d!\n",
> > Ditto here: why remove the prefix?
> 
> The log will be:
> 
> 2022-05-02T17:28:51.006593+00:00 smithi040 kernel: ceph: ceph: async 
> create failure path=(1)client.0/tmp/ceph/src/os/ObjectStore.h result=-17!
> 
> And the prefix "ceph:" is really needed ?
> 

Nope. Disregard my earlier comment.

> > >   			base, IS_ERR(path) ? "<<bad>>" : path, result);
> > >   		ceph_mdsc_free_path(path, pathlen);
> > >   	}
> > > @@ -1180,6 +1215,8 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry)
> > >   
> > >   	if (try_async && op == CEPH_MDS_OP_UNLINK &&
> > >   	    (req->r_dir_caps = get_caps_for_async_unlink(dir, dentry))) {
> > > +		struct ceph_dentry_info *di = ceph_dentry(dentry);
> > > +
> > >   		dout("async unlink on %llu/%.*s caps=%s", ceph_ino(dir),
> > >   		     dentry->d_name.len, dentry->d_name.name,
> > >   		     ceph_cap_string(req->r_dir_caps));
> > > @@ -1187,6 +1224,16 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry)
> > >   		req->r_callback = ceph_async_unlink_cb;
> > >   		req->r_old_inode = d_inode(dentry);
> > >   		ihold(req->r_old_inode);
> > > +
> > > +		spin_lock(&dentry->d_lock);
> > > +		di->flags |= CEPH_DENTRY_ASYNC_UNLINK;
> > > +		spin_unlock(&dentry->d_lock);
> > > +
> > > +		spin_lock(&fsc->async_unlink_conflict_lock);
> > > +		hash_add_rcu(fsc->async_unlink_conflict, &di->hnode,
> > > +			     dentry->d_name.hash);
> > > +		spin_unlock(&fsc->async_unlink_conflict_lock);
> > > +
> > >   		err = ceph_mdsc_submit_request(mdsc, dir, req);
> > >   		if (!err) {
> > >   			/*
> > > @@ -1198,6 +1245,15 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry)
> > >   		} else if (err == -EJUKEBOX) {
> > >   			try_async = false;
> > >   			ceph_mdsc_put_request(req);
> > > +
> > > +			spin_lock(&dentry->d_lock);
> > > +			di->flags &= ~CEPH_DENTRY_ASYNC_UNLINK;
> > > +			spin_unlock(&dentry->d_lock);
> > > +
> > > +			spin_lock(&fsc->async_unlink_conflict_lock);
> > > +			hash_del_rcu(&di->hnode);
> > > +			spin_unlock(&fsc->async_unlink_conflict_lock);
> > > +
> > You're clearing the flag here before removing it from the hash. That
> > might end up causing a waiter to trip the pr_warn in
> > ceph_wait_on_conflict_unlink. You probably want to reverse the order
> > here.
> 
> Sure. Will fix it.
> 
> -- Xiubo
> 

-- 
Jeff Layton <jlayton@xxxxxxxxxx>