Re: [PATCH RFC] ceph: flush the delayed caps in time

On Wed, 2021-07-21 at 16:27 +0800, xiubli@xxxxxxxxxx wrote:
> From: Xiubo Li <xiubli@xxxxxxxxxx>
> 
> The delayed_work is only executed every 5 seconds, and during that
> window the cap_delay_list may accumulate thousands of caps that need
> to be flushed. Flushing them all at once fills the MDS's dispatch
> queue, which then takes a very long time to drain. If some other
> operation arrives in the meantime, such as a rmdir request, it is
> added to the tail of the dispatch queue and may have to wait several
> or even tens of seconds.
> 
> On the client side we shouldn't queue up too many cap requests;
> instead, flush them once there are more than 100 items.
> 

Why 100? My inclination is to say NAK on this.

I'm not a fan of this sort of arbitrary throttling in order to alleviate
a scalability problem in the MDS. The cap delay logic is already
horrifically complex as well, and this just makes it worse. If the MDS
happens to be processing requests from a lot of clients, this may not
help at all.

> URL: https://tracker.ceph.com/issues/51734

The core problem sounds like the MDS got a request and just took far too
long to get to it. Shouldn't we be addressing this in the MDS instead?
It sounds like it needs to do a better job of parallelizing cap flushes.
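
To put a rough number on the delay being described (every value below is
an assumption for illustration only, none of them come from the tracker
issue), the wait a request sees is roughly the size of the flushed batch
divided by the MDS dispatch rate:

/*
 * Back-of-the-envelope model of the queueing delay described in the
 * changelog. All rates are assumed, not measured.
 */
#include <stdio.h>

int main(void)
{
	double delay_period_s     = 5.0;    /* delayed_work interval on the client */
	double caps_dirtied_per_s = 2000.0; /* assumed rate at which caps are queued */
	double mds_dispatch_per_s = 1000.0; /* assumed MDS message-handling rate */

	/* caps accumulated on cap_delay_list before the work fires */
	double batch = delay_period_s * caps_dirtied_per_s;

	/* a request (e.g. rmdir) enqueued behind that batch waits roughly */
	double wait_s = batch / mds_dispatch_per_s;

	printf("caps flushed in one batch: %.0f\n", batch);
	printf("approx. wait for a request queued behind them: %.1f s\n", wait_s);
	return 0;
}

Whichever side the fix lands on (throttling in the client or better
parallelism in the MDS), the wait a blocked request sees scales with the
size of the batch queued ahead of it.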

> Signed-off-by: Xiubo Li <xiubli@xxxxxxxxxx>
> ---
>  fs/ceph/caps.c       | 21 ++++++++++++++++++++-
>  fs/ceph/mds_client.c |  3 ++-
>  fs/ceph/mds_client.h |  3 +++
>  3 files changed, 25 insertions(+), 2 deletions(-)
> 
> diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
> index 4b966c29d9b5..064865761d2b 100644
> --- a/fs/ceph/caps.c
> +++ b/fs/ceph/caps.c
> @@ -507,6 +507,8 @@ static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
>  static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
>  				struct ceph_inode_info *ci)
>  {
> +	int num = 0;
> +
>  	dout("__cap_delay_requeue %p flags 0x%lx at %lu\n", &ci->vfs_inode,
>  	     ci->i_ceph_flags, ci->i_hold_caps_max);
>  	if (!mdsc->stopping) {
> @@ -515,12 +517,19 @@ static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
>  			if (ci->i_ceph_flags & CEPH_I_FLUSH)
>  				goto no_change;
>  			list_del_init(&ci->i_cap_delay_list);
> +			mdsc->num_cap_delay--;
>  		}
>  		__cap_set_timeouts(mdsc, ci);
>  		list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
> +		num = ++mdsc->num_cap_delay;
>  no_change:
>  		spin_unlock(&mdsc->cap_delay_lock);
>  	}
> +
> +	if (num > 100) {
> +		flush_delayed_work(&mdsc->delayed_work);
> +		schedule_delayed(mdsc);
> +	}
>  }
>  
>  /*
> @@ -531,13 +540,23 @@ static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
>  static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc,
>  				      struct ceph_inode_info *ci)
>  {
> +	int num;
> +
>  	dout("__cap_delay_requeue_front %p\n", &ci->vfs_inode);
>  	spin_lock(&mdsc->cap_delay_lock);
>  	ci->i_ceph_flags |= CEPH_I_FLUSH;
> -	if (!list_empty(&ci->i_cap_delay_list))
> +	if (!list_empty(&ci->i_cap_delay_list)) {
>  		list_del_init(&ci->i_cap_delay_list);
> +		mdsc->num_cap_delay--;
> +	}
>  	list_add(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
> +	num = ++mdsc->num_cap_delay;
>  	spin_unlock(&mdsc->cap_delay_lock);
> +
> +	if (num > 100) {
> +		flush_delayed_work(&mdsc->delayed_work);
> +		schedule_delayed(mdsc);
> +	}
>  }
>  
>  /*
> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> index 79aa4ce3a388..14e44de05812 100644
> --- a/fs/ceph/mds_client.c
> +++ b/fs/ceph/mds_client.c
> @@ -4514,7 +4514,7 @@ void inc_session_sequence(struct ceph_mds_session *s)
>  /*
>   * delayed work -- periodically trim expired leases, renew caps with mds
>   */
> -static void schedule_delayed(struct ceph_mds_client *mdsc)
> +void schedule_delayed(struct ceph_mds_client *mdsc)
>  {
>  	int delay = 5;
>  	unsigned hz = round_jiffies_relative(HZ * delay);
> @@ -4616,6 +4616,7 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
>  	mdsc->request_tree = RB_ROOT;
>  	INIT_DELAYED_WORK(&mdsc->delayed_work, delayed_work);
>  	mdsc->last_renew_caps = jiffies;
> +	mdsc->num_cap_delay = 0;
>  	INIT_LIST_HEAD(&mdsc->cap_delay_list);
>  	INIT_LIST_HEAD(&mdsc->cap_wait_list);
>  	spin_lock_init(&mdsc->cap_delay_lock);
> diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
> index a7af09257382..b4289b8d23ec 100644
> --- a/fs/ceph/mds_client.h
> +++ b/fs/ceph/mds_client.h
> @@ -423,6 +423,7 @@ struct ceph_mds_client {
>  	struct rb_root         request_tree;  /* pending mds requests */
>  	struct delayed_work    delayed_work;  /* delayed work */
>  	unsigned long    last_renew_caps;  /* last time we renewed our caps */
> +	unsigned long    num_cap_delay;    /* caps in the cap_delay_list */
>  	struct list_head cap_delay_list;   /* caps with delayed release */
>  	spinlock_t       cap_delay_lock;   /* protects cap_delay_list */
>  	struct list_head snap_flush_list;  /* cap_snaps ready to flush */
> @@ -568,6 +569,8 @@ extern int ceph_trim_caps(struct ceph_mds_client *mdsc,
>  			  struct ceph_mds_session *session,
>  			  int max_caps);
>  
> +extern void schedule_delayed(struct ceph_mds_client *mdsc);
> +
>  static inline int ceph_wait_on_async_create(struct inode *inode)
>  {
>  	struct ceph_inode_info *ci = ceph_inode(inode);

-- 
Jeff Layton <jlayton@xxxxxxxxxx>



