On Wed, Jan 17, 2024 at 10:00 AM <xiubli@xxxxxxxxxx> wrote:
>
> From: Xiubo Li <xiubli@xxxxxxxxxx>
>
> When unlinking a file, the cap check could be delayed for more than
> 5 seconds, while the MDS may be waiting for the clients to release
> caps.
>
> Use the cap_wq work queue and a dedicated list to fire check_caps()
> and the dirty buffer flushing immediately.
>
> URL: https://tracker.ceph.com/issues/50223
> Signed-off-by: Xiubo Li <xiubli@xxxxxxxxxx>
> ---
>  fs/ceph/caps.c       | 17 +++++++++++++++-
>  fs/ceph/mds_client.c | 48 ++++++++++++++++++++++++++++++++++++++++++++
>  fs/ceph/mds_client.h |  5 +++++
>  3 files changed, 69 insertions(+), 1 deletion(-)
>
> diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
> index c0db0e9e82d2..ba94ad6d45fe 100644
> --- a/fs/ceph/caps.c
> +++ b/fs/ceph/caps.c
> @@ -4785,7 +4785,22 @@ int ceph_drop_caps_for_unlink(struct inode *inode)
>                  if (__ceph_caps_dirty(ci)) {
>                          struct ceph_mds_client *mdsc =
>                                  ceph_inode_to_fs_client(inode)->mdsc;
> -                        __cap_delay_requeue_front(mdsc, ci);
> +
> +                        doutc(mdsc->fsc->client, "%p %llx.%llx\n", inode,
> +                              ceph_vinop(inode));
> +                        spin_lock(&mdsc->cap_unlink_delay_lock);
> +                        ci->i_ceph_flags |= CEPH_I_FLUSH;
> +                        if (!list_empty(&ci->i_cap_delay_list))
> +                                list_del_init(&ci->i_cap_delay_list);
> +                        list_add_tail(&ci->i_cap_delay_list,
> +                                      &mdsc->cap_unlink_delay_list);
> +                        spin_unlock(&mdsc->cap_unlink_delay_lock);
> +
> +                        /*
> +                         * Fire the work immediately, because the MDS may be
> +                         * waiting for caps release.
> +                         */
> +                        ceph_queue_cap_unlink_work(mdsc);
>                  }
>          }
>          spin_unlock(&ci->i_ceph_lock);
> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> index 29295041b7b4..e2352e94c5bc 100644
> --- a/fs/ceph/mds_client.c
> +++ b/fs/ceph/mds_client.c
> @@ -2512,6 +2512,50 @@ void ceph_reclaim_caps_nr(struct ceph_mds_client *mdsc, int nr)
>          }
>  }
>
> +void ceph_queue_cap_unlink_work(struct ceph_mds_client *mdsc)
> +{
> +        struct ceph_client *cl = mdsc->fsc->client;
> +        if (mdsc->stopping)
> +                return;
> +
> +        if (queue_work(mdsc->fsc->cap_wq, &mdsc->cap_unlink_work)) {
> +                doutc(cl, "caps unlink work queued\n");
> +        } else {
> +                doutc(cl, "failed to queue caps unlink work\n");
> +        }
> +}
> +
> +static void ceph_cap_unlink_work(struct work_struct *work)
> +{
> +        struct ceph_mds_client *mdsc =
> +                container_of(work, struct ceph_mds_client, cap_unlink_work);
> +        struct ceph_client *cl = mdsc->fsc->client;
> +
> +        doutc(cl, "begin\n");
> +        spin_lock(&mdsc->cap_unlink_delay_lock);
> +        while (!list_empty(&mdsc->cap_unlink_delay_list)) {
> +                struct ceph_inode_info *ci;
> +                struct inode *inode;
> +
> +                ci = list_first_entry(&mdsc->cap_unlink_delay_list,
> +                                      struct ceph_inode_info,
> +                                      i_cap_delay_list);
> +                list_del_init(&ci->i_cap_delay_list);
> +
> +                inode = igrab(&ci->netfs.inode);
> +                if (inode) {
> +                        spin_unlock(&mdsc->cap_unlink_delay_lock);
> +                        doutc(cl, "on %p %llx.%llx\n", inode,
> +                              ceph_vinop(inode));
> +                        ceph_check_caps(ci, CHECK_CAPS_FLUSH);
> +                        iput(inode);
> +                        spin_lock(&mdsc->cap_unlink_delay_lock);
> +                }
> +        }
> +        spin_unlock(&mdsc->cap_unlink_delay_lock);
> +        doutc(cl, "done\n");
> +}
> +
>  /*
>   * requests
>   */
> @@ -5493,6 +5537,8 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
>          INIT_LIST_HEAD(&mdsc->cap_delay_list);
>          INIT_LIST_HEAD(&mdsc->cap_wait_list);
>          spin_lock_init(&mdsc->cap_delay_lock);
> +        INIT_LIST_HEAD(&mdsc->cap_unlink_delay_list);
> +        spin_lock_init(&mdsc->cap_unlink_delay_lock);
>          INIT_LIST_HEAD(&mdsc->snap_flush_list);
>          spin_lock_init(&mdsc->snap_flush_lock);
>          mdsc->last_cap_flush_tid = 1;
> @@ -5501,6 +5547,7 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
>          spin_lock_init(&mdsc->cap_dirty_lock);
>          init_waitqueue_head(&mdsc->cap_flushing_wq);
>          INIT_WORK(&mdsc->cap_reclaim_work, ceph_cap_reclaim_work);
> +        INIT_WORK(&mdsc->cap_unlink_work, ceph_cap_unlink_work);
>          err = ceph_metric_init(&mdsc->metric);
>          if (err)
>                  goto err_mdsmap;
> @@ -5931,6 +5978,7 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
>          ceph_cleanup_global_and_empty_realms(mdsc);
>
>          cancel_work_sync(&mdsc->cap_reclaim_work);
> +        cancel_work_sync(&mdsc->cap_unlink_work);
>          cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
>
>          doutc(cl, "done\n");
> diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
> index 65f0720d1671..317a0fd6a8ba 100644
> --- a/fs/ceph/mds_client.h
> +++ b/fs/ceph/mds_client.h
> @@ -482,6 +482,8 @@ struct ceph_mds_client {
>          unsigned long last_renew_caps;      /* last time we renewed our caps */
>          struct list_head cap_delay_list;    /* caps with delayed release */
>          spinlock_t cap_delay_lock;          /* protects cap_delay_list */
> +        struct list_head cap_unlink_delay_list;  /* caps with delayed release for unlink */
> +        spinlock_t cap_unlink_delay_lock;   /* protects cap_unlink_delay_list */
>          struct list_head snap_flush_list;   /* cap_snaps ready to flush */
>          spinlock_t snap_flush_lock;
>
> @@ -495,6 +497,8 @@ struct ceph_mds_client {
>          struct work_struct cap_reclaim_work;
>          atomic_t cap_reclaim_pending;
>
> +        struct work_struct cap_unlink_work;
> +
>          /*
>           * Cap reservations
>           *
> @@ -597,6 +601,7 @@ extern void ceph_flush_cap_releases(struct ceph_mds_client *mdsc,
>                                      struct ceph_mds_session *session);
>  extern void ceph_queue_cap_reclaim_work(struct ceph_mds_client *mdsc);
>  extern void ceph_reclaim_caps_nr(struct ceph_mds_client *mdsc, int nr);
> +extern void ceph_queue_cap_unlink_work(struct ceph_mds_client *mdsc);
>  extern int ceph_iterate_session_caps(struct ceph_mds_session *session,
>                                       int (*cb)(struct inode *, int mds, void *),
>                                       void *arg);
> --
> 2.43.0
>

Tested-by: Venky Shankar <vshankar@xxxxxxxxxx>

--
Cheers,
Venky
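
For readers following the thread, a minimal, self-contained sketch of the pattern the patch applies may help. This is not Ceph code: the demo_* names are hypothetical. It enqueues items on a spinlock-protected list and kicks a work item immediately; the worker drains the list one entry at a time, dropping the lock around the per-item processing the way ceph_cap_unlink_work() drops cap_unlink_delay_lock around ceph_check_caps(). The patch queues onto its own workqueue (mdsc->fsc->cap_wq); this sketch uses the system workqueue via schedule_work() for brevity.

/*
 * Minimal sketch of the deferred-drain pattern (hypothetical demo_*
 * names, not Ceph code).
 */
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/workqueue.h>

struct demo_item {
        struct list_head node;
        int id;
};

static LIST_HEAD(demo_list);
static DEFINE_SPINLOCK(demo_lock);

/*
 * Mirrors ceph_cap_unlink_work(): pop one entry at a time and drop
 * the lock around the per-item work, since that work may sleep or
 * take other locks (as ceph_check_caps() does).
 */
static void demo_drain(struct work_struct *work)
{
        spin_lock(&demo_lock);
        while (!list_empty(&demo_list)) {
                struct demo_item *it;

                it = list_first_entry(&demo_list, struct demo_item, node);
                list_del_init(&it->node);
                spin_unlock(&demo_lock);
                pr_info("demo: handling item %d\n", it->id);
                kfree(it);
                spin_lock(&demo_lock);
        }
        spin_unlock(&demo_lock);
}

static DECLARE_WORK(demo_work, demo_drain);

/*
 * Mirrors ceph_drop_caps_for_unlink() + ceph_queue_cap_unlink_work():
 * queue the item and fire the work immediately instead of waiting for
 * a periodic tick.
 */
static void demo_queue(int id)
{
        struct demo_item *it = kmalloc(sizeof(*it), GFP_KERNEL);

        if (!it)
                return;

        it->id = id;
        spin_lock(&demo_lock);
        list_add_tail(&it->node, &demo_list);
        spin_unlock(&demo_lock);
        schedule_work(&demo_work);
}

static int __init demo_init(void)
{
        demo_queue(1);
        return 0;
}

static void __exit demo_exit(void)
{
        struct demo_item *it, *tmp;

        /* Like ceph_mdsc_close_sessions(): stop the worker, then
         * free anything still queued. */
        cancel_work_sync(&demo_work);
        list_for_each_entry_safe(it, tmp, &demo_list, node)
                kfree(it);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("deferred-drain list demo");

Kicking the work from the enqueue path is the behavioral change the patch makes for unlink: previously the inode was only requeued on cap_delay_list, so the flush waited for the delayed-work timer, which could leave the MDS blocked on the cap release for up to 5 seconds.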