On 26.11.21 at 14:06, Michal Koutný wrote:
> The commit b027789e5e50 ("sched/fair: Prevent dead task groups from
> regaining cfs_rq's") tackled the issue of use-after-free of cfs_rqs
> surviving on the leaf list after cgroup removal. Two key changes of the
> fix are:
> - move destroy_cfs_bandwidth() before the list_del_leaf_cfs_rq() so that
>   tg_unthrottle_up(cfs_rq) will not re-add the cfs_rq into the list,
> - insert RCU GP between task_group tree unlinking and leaf list removal
>   (ancestors walk the tree and could call tg_unthrottle_up() (re-add)
>   too).
>
> This is correct but it makes the task_group removal path in the cpu
> controller unnecessarily complex. This patch simplifies the code by
> removing the GP but ensuring that cfs_rq won't be re-added when it is
> going away. The css_is_dying() check and list_add_leaf_cfs_rq() cannot
> race against sched_released_group() because they are both in one RCU
> read section and cpu_cgroup_css_released() is called after an RCU GP
> (more precisely the sequence is: rmdir, css_is_dying()=1, GP,
> .css_offline(), .css_released()).
>
> Autogroups are not cgroups so they are not affected by the bandwidth timer
> (GP before free is kept).
>
> Cc: Tejun Heo <tj@xxxxxxxxxx>
> Signed-off-by: Michal Koutný <mkoutny@xxxxxxxx>
> ---
>  kernel/sched/autogroup.c | 10 ++++++--
>  kernel/sched/core.c      | 55 ++++++++--------------------------------
>  kernel/sched/fair.c      |  6 +++++
>  kernel/sched/sched.h     |  4 +--
>  4 files changed, 27 insertions(+), 48 deletions(-)
>
> As explained in the message, this relies on the RCU GP between css_is_dying()
> returning false and the potential .css_offline() call.
> This is stated in the css_is_dying() documentation:
>
>> the actual offline operations are RCU delayed and this test returns %true
>> also when @css is scheduled to be offlined.
>
> On the other hand the documentation of the underlying
> percpu_ref_kill_and_confirm() says (to discourage relying on the GP):
>
>> There are no implied RCU grace periods between kill and release.
>
> These seem to contradict each other at first thought. (That's why I marked
> this RFC.)
>
> However, if one takes into account that percpu_refs as used by css are never
> switched to atomic besides the actual killing (and they start in per-cpu mode),
> the GP (inserted in __percpu_ref_switch_to_atomic()) is warranted.

This feels fragile and very implicit. It at least needs a corresponding
mention in the comment above the call to percpu_ref_kill_and_confirm().
But even then, the RCU GP is an implementation detail of percpu_refs and
relying on it is kind of ignoring the API guarantees. Doesn't feel
right. :/

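To illustrate what such a mention could look like, here is a rough sketch
against the kill site in kill_css() (kernel/cgroup/cgroup.c); the placement
and wording are my assumption only, not part of this patch:

	/*
	 * Sketch only: the cpu controller's teardown (sched_released_group()
	 * and the css_is_dying() check in tg_unthrottle_up()) relies on the
	 * RCU grace period that __percpu_ref_switch_to_atomic() inserts
	 * before css_killed_ref_fn() can run, i.e. between css_is_dying()
	 * becoming true and .css_offline()/.css_released() being called.
	 */
	percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn);
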
> 
> 
> diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c
> index 8629b37d118e..e53d1ea9afc0 100644
> --- a/kernel/sched/autogroup.c
> +++ b/kernel/sched/autogroup.c
> @@ -22,6 +22,11 @@ void autogroup_free(struct task_group *tg)
>  	kfree(tg->autogroup);
>  }
>  
> +static void sched_free_group_rcu(struct rcu_head *rcu)
> +{
> +	sched_free_group(container_of(rcu, struct task_group, rcu));
> +}
> +
>  static inline void autogroup_destroy(struct kref *kref)
>  {
>  	struct autogroup *ag = container_of(kref, struct autogroup, kref);
> @@ -31,8 +36,9 @@ static inline void autogroup_destroy(struct kref *kref)
>  	ag->tg->rt_se = NULL;
>  	ag->tg->rt_rq = NULL;
>  #endif
> -	sched_release_group(ag->tg);
> -	sched_destroy_group(ag->tg);
> +	sched_released_group(ag->tg);
> +	/* Wait for possible concurrent references to cfs_rqs complete: */
> +	call_rcu(&ag->tg->rcu, sched_free_group_rcu);
>  }

However, I do like the above cleanup, as it adds an explicit RCU grace period
only to autogroup, which needs it, instead of adding it to all other users.

>  
>  static inline void autogroup_kref_put(struct autogroup *ag)
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 3c9b0fda64ac..2c5b22f54ab8 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -9711,7 +9711,7 @@ static inline void alloc_uclamp_sched_group(struct task_group *tg,
>  #endif
>  }
>  
> -static void sched_free_group(struct task_group *tg)
> +void sched_free_group(struct task_group *tg)
>  {
>  	free_fair_sched_group(tg);
>  	free_rt_sched_group(tg);
> @@ -9719,22 +9719,6 @@ static void sched_free_group(struct task_group *tg)
>  	kmem_cache_free(task_group_cache, tg);
>  }
>  
> -static void sched_free_group_rcu(struct rcu_head *rcu)
> -{
> -	sched_free_group(container_of(rcu, struct task_group, rcu));
> -}
> -
> -static void sched_unregister_group(struct task_group *tg)
> -{
> -	unregister_fair_sched_group(tg);
> -	unregister_rt_sched_group(tg);
> -	/*
> -	 * We have to wait for yet another RCU grace period to expire, as
> -	 * print_cfs_stats() might run concurrently.
> -	 */
> -	call_rcu(&tg->rcu, sched_free_group_rcu);
> -}
> -
>  /* allocate runqueue etc for a new task group */
>  struct task_group *sched_create_group(struct task_group *parent)
>  {
> @@ -9777,40 +9761,23 @@ void sched_online_group(struct task_group *tg, struct task_group *parent)
>  	online_fair_sched_group(tg);
>  }
>  
> -/* rcu callback to free various structures associated with a task group */
> -static void sched_unregister_group_rcu(struct rcu_head *rhp)
> -{
> -	/* Now it should be safe to free those cfs_rqs: */
> -	sched_unregister_group(container_of(rhp, struct task_group, rcu));
> -}
> -
> -void sched_destroy_group(struct task_group *tg)
> -{
> -	/* Wait for possible concurrent references to cfs_rqs complete: */
> -	call_rcu(&tg->rcu, sched_unregister_group_rcu);
> -}
> -
> -void sched_release_group(struct task_group *tg)
> +void sched_released_group(struct task_group *tg)
>  {
>  	unsigned long flags;
>  
>  	/*
> -	 * Unlink first, to avoid walk_tg_tree_from() from finding us (via
> -	 * sched_cfs_period_timer()).
> -	 *
> -	 * For this to be effective, we have to wait for all pending users of
> -	 * this task group to leave their RCU critical section to ensure no new
> -	 * user will see our dying task group any more. Specifically ensure
> -	 * that tg_unthrottle_up() won't add decayed cfs_rq's to it.
> -	 *
> -	 * We therefore defer calling unregister_fair_sched_group() to
> -	 * sched_unregister_group() which is guarantied to get called only after the
> -	 * current RCU grace period has expired.
> +	 * We get here only after all CPUs see tg as dying and an RCU grace
> +	 * period. Despite that there may still be concurrent RCU readers
> +	 * (walk_tg_tree_from() or task_group list) in their RCU sections.
> +	 * Their references to tg stay valid only inside the RCU read section.
>  	 */
>  	spin_lock_irqsave(&task_group_lock, flags);
>  	list_del_rcu(&tg->list);
>  	list_del_rcu(&tg->siblings);
>  	spin_unlock_irqrestore(&task_group_lock, flags);
> +
> +	unregister_fair_sched_group(tg);
> +	unregister_rt_sched_group(tg);
>  }
>  
>  static void sched_change_group(struct task_struct *tsk, int type)
> @@ -9925,7 +9892,7 @@ static void cpu_cgroup_css_released(struct cgroup_subsys_state *css)
>  {
>  	struct task_group *tg = css_tg(css);
>  
> -	sched_release_group(tg);
> +	sched_released_group(tg);
>  }
>  
>  static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
> @@ -9935,7 +9902,7 @@ static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
>  	/*
>  	 * Relies on the RCU grace period between css_released() and this.
>  	 */
> -	sched_unregister_group(tg);
> +	sched_free_group(tg);
>  }
>  
>  /*
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 6e476f6d9435..b3081ece2e16 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -4762,6 +4762,12 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
>  		cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
>  					     cfs_rq->throttled_clock_task;
>  
> +	/*
> +	 * Last tg_unthrottle_up() may happen in a task_group being removed,
> +	 * it is only RCU protected so don't store it into leaf list.
> +	 */
> +	if (css_is_dying(&tg->css))
> +		return 0;

This, however, looks like a band aid. I'd rather not call
tg_unthrottle_up() on a dying css.

>  	/* Add cfs_rq with load or one or more already running entities to the list */
>  	if (!cfs_rq_is_decayed(cfs_rq) || cfs_rq->nr_running)
>  		list_add_leaf_cfs_rq(cfs_rq);
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 0e66749486e7..560151258d92 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -503,8 +503,8 @@ extern int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk);
>  extern struct task_group *sched_create_group(struct task_group *parent);
>  extern void sched_online_group(struct task_group *tg,
>  			struct task_group *parent);
> -extern void sched_destroy_group(struct task_group *tg);
> -extern void sched_release_group(struct task_group *tg);
> +extern void sched_released_group(struct task_group *tg);
> +extern void sched_free_group(struct task_group *tg);
>  
>  extern void sched_move_task(struct task_struct *tsk);
> 

Thanks,
Mathias