On Wed, 2022-05-18 at 22:45 +0800, Xiubo Li wrote: > In async unlink case the kclient won't wait for the first reply > from MDS and just drop all the links and unhash the dentry and then > succeeds immediately. > > For any new create/link/rename,etc requests followed by using the > same file names we must wait for the first reply of the inflight > unlink request, or the MDS possibly will fail these following > requests with -EEXIST if the inflight async unlink request was > delayed for some reasons. > > And the worst case is that for the none async openc request it will > successfully open the file if the CDentry hasn't been unlinked yet, > but later the previous delayed async unlink request will remove the > CDenty. That means the just created file is possiblly deleted later > by accident. > > We need to wait for the inflight async unlink requests to finish > when creating new files/directories by using the same file names. > > URL: https://tracker.ceph.com/issues/55332 > Reported-by: kernel test robot <lkp@xxxxxxxxx> > Signed-off-by: Xiubo Li <xiubli@xxxxxxxxxx> > --- > fs/ceph/dir.c | 70 +++++++++++++++++++++++++++++++++++++++--- > fs/ceph/file.c | 4 +++ > fs/ceph/mds_client.c | 73 ++++++++++++++++++++++++++++++++++++++++++++ > fs/ceph/mds_client.h | 1 + > fs/ceph/super.c | 3 ++ > fs/ceph/super.h | 19 +++++++++--- > 6 files changed, 160 insertions(+), 10 deletions(-) > > diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c > index eae417d71136..01e7facef9b2 100644 > --- a/fs/ceph/dir.c > +++ b/fs/ceph/dir.c > @@ -856,6 +856,10 @@ static int ceph_mknod(struct user_namespace *mnt_userns, struct inode *dir, > if (ceph_snap(dir) != CEPH_NOSNAP) > return -EROFS; > > + err = ceph_wait_on_conflict_unlink(dentry); > + if (err) > + return err; > + > if (ceph_quota_is_max_files_exceeded(dir)) { > err = -EDQUOT; > goto out; > @@ -918,6 +922,10 @@ static int ceph_symlink(struct user_namespace *mnt_userns, struct inode *dir, > if (ceph_snap(dir) != CEPH_NOSNAP) > return -EROFS; > > + err = ceph_wait_on_conflict_unlink(dentry); > + if (err) > + return err; > + > if (ceph_quota_is_max_files_exceeded(dir)) { > err = -EDQUOT; > goto out; > @@ -968,9 +976,13 @@ static int ceph_mkdir(struct user_namespace *mnt_userns, struct inode *dir, > struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb); > struct ceph_mds_request *req; > struct ceph_acl_sec_ctx as_ctx = {}; > - int err = -EROFS; > + int err; > int op; > > + err = ceph_wait_on_conflict_unlink(dentry); > + if (err) > + return err; > + > if (ceph_snap(dir) == CEPH_SNAPDIR) { > /* mkdir .snap/foo is a MKSNAP */ > op = CEPH_MDS_OP_MKSNAP; > @@ -980,6 +992,7 @@ static int ceph_mkdir(struct user_namespace *mnt_userns, struct inode *dir, > dout("mkdir dir %p dn %p mode 0%ho\n", dir, dentry, mode); > op = CEPH_MDS_OP_MKDIR; > } else { > + err = -EROFS; > goto out; > } > > @@ -1037,6 +1050,10 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir, > struct ceph_mds_request *req; > int err; > > + err = ceph_wait_on_conflict_unlink(dentry); > + if (err) > + return err; > + > if (ceph_snap(dir) != CEPH_NOSNAP) > return -EROFS; > > @@ -1071,9 +1088,27 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir, > static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc, > struct ceph_mds_request *req) > { > + struct dentry *dentry = req->r_dentry; > + struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); > + struct ceph_dentry_info *di = ceph_dentry(dentry); > int result = req->r_err ? req->r_err : > le32_to_cpu(req->r_reply_info.head->result); > > + if (!test_bit(CEPH_DENTRY_ASYNC_UNLINK_BIT, &di->flags)) > + pr_warn("%s dentry %p:%pd async unlink bit is not set\n", > + __func__, dentry, dentry); > + This is a pr_warn so it won't have a "ceph: " prefix or anything prepending it like dout messages do. You should probably prepend this with something like that. > + spin_lock(&fsc->async_unlink_conflict_lock); > + hash_del_rcu(&di->hnode); > + spin_unlock(&fsc->async_unlink_conflict_lock); > + > + spin_lock(&dentry->d_lock); > + di->flags &= ~CEPH_DENTRY_ASYNC_UNLINK; > + wake_up_bit(&di->flags, CEPH_DENTRY_ASYNC_UNLINK_BIT); > + spin_unlock(&dentry->d_lock); > + > + synchronize_rcu(); > + > if (result == -EJUKEBOX) > goto out; > > @@ -1081,7 +1116,7 @@ static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc, > if (result) { > int pathlen = 0; > u64 base = 0; > - char *path = ceph_mdsc_build_path(req->r_dentry, &pathlen, > + char *path = ceph_mdsc_build_path(dentry, &pathlen, > &base, 0); > > /* mark error on parent + clear complete */ > @@ -1089,13 +1124,13 @@ static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc, > ceph_dir_clear_complete(req->r_parent); > > /* drop the dentry -- we don't know its status */ > - if (!d_unhashed(req->r_dentry)) > - d_drop(req->r_dentry); > + if (!d_unhashed(dentry)) > + d_drop(dentry); > > /* mark inode itself for an error (since metadata is bogus) */ > mapping_set_error(req->r_old_inode->i_mapping, result); > > - pr_warn("ceph: async unlink failure path=(%llx)%s result=%d!\n", > + pr_warn("async unlink failure path=(%llx)%s result=%d!\n", Ditto here: why remove the prefix? > base, IS_ERR(path) ? "<<bad>>" : path, result); > ceph_mdsc_free_path(path, pathlen); > } > @@ -1180,6 +1215,8 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry) > > if (try_async && op == CEPH_MDS_OP_UNLINK && > (req->r_dir_caps = get_caps_for_async_unlink(dir, dentry))) { > + struct ceph_dentry_info *di = ceph_dentry(dentry); > + > dout("async unlink on %llu/%.*s caps=%s", ceph_ino(dir), > dentry->d_name.len, dentry->d_name.name, > ceph_cap_string(req->r_dir_caps)); > @@ -1187,6 +1224,16 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry) > req->r_callback = ceph_async_unlink_cb; > req->r_old_inode = d_inode(dentry); > ihold(req->r_old_inode); > + > + spin_lock(&dentry->d_lock); > + di->flags |= CEPH_DENTRY_ASYNC_UNLINK; > + spin_unlock(&dentry->d_lock); > + > + spin_lock(&fsc->async_unlink_conflict_lock); > + hash_add_rcu(fsc->async_unlink_conflict, &di->hnode, > + dentry->d_name.hash); > + spin_unlock(&fsc->async_unlink_conflict_lock); > + > err = ceph_mdsc_submit_request(mdsc, dir, req); > if (!err) { > /* > @@ -1198,6 +1245,15 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry) > } else if (err == -EJUKEBOX) { > try_async = false; > ceph_mdsc_put_request(req); > + > + spin_lock(&dentry->d_lock); > + di->flags &= ~CEPH_DENTRY_ASYNC_UNLINK; > + spin_unlock(&dentry->d_lock); > + > + spin_lock(&fsc->async_unlink_conflict_lock); > + hash_del_rcu(&di->hnode); > + spin_unlock(&fsc->async_unlink_conflict_lock); > + You're clearing the flag here before removing it from the hash. That might end up causing a waiter to trip the pr_warn in ceph_wait_on_conflict_unlink. You probably want to reverse the order here. > goto retry; > } > } else { > @@ -1237,6 +1293,10 @@ static int ceph_rename(struct user_namespace *mnt_userns, struct inode *old_dir, > (!ceph_quota_is_same_realm(old_dir, new_dir))) > return -EXDEV; > > + err = ceph_wait_on_conflict_unlink(new_dentry); > + if (err) > + return err; > + > dout("rename dir %p dentry %p to dir %p dentry %p\n", > old_dir, old_dentry, new_dir, new_dentry); > req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); > diff --git a/fs/ceph/file.c b/fs/ceph/file.c > index 8c8226c0feac..f039e799f5f4 100644 > --- a/fs/ceph/file.c > +++ b/fs/ceph/file.c > @@ -740,6 +740,10 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, > if (dentry->d_name.len > NAME_MAX) > return -ENAMETOOLONG; > > + err = ceph_wait_on_conflict_unlink(dentry); > + if (err) > + return err; > + > if (flags & O_CREAT) { > if (ceph_quota_is_max_files_exceeded(dir)) > return -EDQUOT; > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c > index e8c87dea0551..ffe52b7c6cbc 100644 > --- a/fs/ceph/mds_client.c > +++ b/fs/ceph/mds_client.c > @@ -655,6 +655,79 @@ static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info) > free_pages((unsigned long)info->dir_entries, get_order(info->dir_buf_size)); > } > > +/* > + * In async unlink case the kclient won't wait for the first reply > + * from MDS and just drop all the links and unhash the dentry and then > + * succeeds immediately. > + * > + * For any new create/link/rename,etc requests followed by using the > + * same file names we must wait for the first reply of the inflight > + * unlink request, or the MDS possibly will fail these following > + * requests with -EEXIST if the inflight async unlink request was > + * delayed for some reasons. > + * > + * And the worst case is that for the none async openc request it will > + * successfully open the file if the CDentry hasn't been unlinked yet, > + * but later the previous delayed async unlink request will remove the > + * CDenty. That means the just created file is possiblly deleted later > + * by accident. > + * > + * We need to wait for the inflight async unlink requests to finish > + * when creating new files/directories by using the same file names. > + */ > +int ceph_wait_on_conflict_unlink(struct dentry *dentry) > +{ > + struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); > + struct dentry *pdentry = dentry->d_parent; > + struct dentry *udentry, *found = NULL; > + struct ceph_dentry_info *di; > + struct qstr dname; > + u32 hash = dentry->d_name.hash; > + int err; > + > + dname.name = dentry->d_name.name; > + dname.len = dentry->d_name.len; > + > + rcu_read_lock(); > + hash_for_each_possible_rcu(fsc->async_unlink_conflict, di, > + hnode, hash) { > + udentry = di->dentry; > + > + spin_lock(&udentry->d_lock); > + if (udentry->d_name.hash != hash) > + goto next; > + if (unlikely(udentry->d_parent != pdentry)) > + goto next; > + if (!hash_hashed(&di->hnode)) > + goto next; > + > + if (!test_bit(CEPH_DENTRY_ASYNC_UNLINK_BIT, &di->flags)) > + pr_warn("%s dentry %p:%pd async unlink bit is not set\n", > + __func__, dentry, dentry); > + > + if (d_compare(pdentry, udentry, &dname)) > + goto next; > + > + spin_unlock(&udentry->d_lock); > + found = dget(udentry); > + break; > +next: > + spin_unlock(&udentry->d_lock); > + } > + rcu_read_unlock(); > + > + if (likely(!found)) > + return 0; > + > + dout("%s dentry %p:%pd conflict with old %p:%pd\n", __func__, > + dentry, dentry, found, found); > + > + err = wait_on_bit(&di->flags, CEPH_DENTRY_ASYNC_UNLINK_BIT, > + TASK_KILLABLE); > + dput(found); > + return err; > +} > + > > /* > * sessions > diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h > index 33497846e47e..d1ae679c52c3 100644 > --- a/fs/ceph/mds_client.h > +++ b/fs/ceph/mds_client.h > @@ -582,6 +582,7 @@ static inline int ceph_wait_on_async_create(struct inode *inode) > TASK_INTERRUPTIBLE); > } > > +extern int ceph_wait_on_conflict_unlink(struct dentry *dentry); > extern u64 ceph_get_deleg_ino(struct ceph_mds_session *session); > extern int ceph_restore_deleg_ino(struct ceph_mds_session *session, u64 ino); > #endif > diff --git a/fs/ceph/super.c b/fs/ceph/super.c > index b73b4f75462c..6542b71f8627 100644 > --- a/fs/ceph/super.c > +++ b/fs/ceph/super.c > @@ -816,6 +816,9 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, > if (!fsc->cap_wq) > goto fail_inode_wq; > > + hash_init(fsc->async_unlink_conflict); > + spin_lock_init(&fsc->async_unlink_conflict_lock); > + > spin_lock(&ceph_fsc_lock); > list_add_tail(&fsc->metric_wakeup, &ceph_fsc_list); > spin_unlock(&ceph_fsc_lock); > diff --git a/fs/ceph/super.h b/fs/ceph/super.h > index 506d52633627..251e726ec628 100644 > --- a/fs/ceph/super.h > +++ b/fs/ceph/super.h > @@ -19,6 +19,7 @@ > #include <linux/security.h> > #include <linux/netfs.h> > #include <linux/fscache.h> > +#include <linux/hashtable.h> > > #include <linux/ceph/libceph.h> > > @@ -99,6 +100,8 @@ struct ceph_mount_options { > char *mon_addr; > }; > > +#define CEPH_ASYNC_CREATE_CONFLICT_BITS 8 > + > struct ceph_fs_client { > struct super_block *sb; > > @@ -124,6 +127,9 @@ struct ceph_fs_client { > struct workqueue_struct *inode_wq; > struct workqueue_struct *cap_wq; > > + DECLARE_HASHTABLE(async_unlink_conflict, CEPH_ASYNC_CREATE_CONFLICT_BITS); > + spinlock_t async_unlink_conflict_lock; > + > #ifdef CONFIG_DEBUG_FS > struct dentry *debugfs_dentry_lru, *debugfs_caps; > struct dentry *debugfs_congestion_kb; > @@ -281,7 +287,8 @@ struct ceph_dentry_info { > struct dentry *dentry; > struct ceph_mds_session *lease_session; > struct list_head lease_list; > - unsigned flags; > + struct hlist_node hnode; > + unsigned long flags; > int lease_shared_gen; > u32 lease_gen; > u32 lease_seq; > @@ -290,10 +297,12 @@ struct ceph_dentry_info { > u64 offset; > }; > > -#define CEPH_DENTRY_REFERENCED 1 > -#define CEPH_DENTRY_LEASE_LIST 2 > -#define CEPH_DENTRY_SHRINK_LIST 4 > -#define CEPH_DENTRY_PRIMARY_LINK 8 > +#define CEPH_DENTRY_REFERENCED (1 << 0) > +#define CEPH_DENTRY_LEASE_LIST (1 << 1) > +#define CEPH_DENTRY_SHRINK_LIST (1 << 2) > +#define CEPH_DENTRY_PRIMARY_LINK (1 << 3) > +#define CEPH_DENTRY_ASYNC_UNLINK_BIT (4) > +#define CEPH_DENTRY_ASYNC_UNLINK (1 << CEPH_DENTRY_ASYNC_UNLINK_BIT) > > struct ceph_inode_xattrs_info { > /* -- Jeff Layton <jlayton@xxxxxxxxxx>