Re: [RFC PATCH 2/3] psi: cgroup v1 support

On Tue, Jun 04, 2019 at 09:57:44AM +0800, Joseph Qi wrote:
> Implements pressure stall tracking for cgroup v1.
> 
> Signed-off-by: Joseph Qi <joseph.qi@xxxxxxxxxxxxxxxxx>
> ---
>  kernel/sched/psi.c | 65 +++++++++++++++++++++++++++++++++++++++-------
>  1 file changed, 56 insertions(+), 9 deletions(-)
> 
> diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
> index 7acc632c3b82..909083c828d5 100644
> --- a/kernel/sched/psi.c
> +++ b/kernel/sched/psi.c
> @@ -719,13 +719,30 @@ static u32 psi_group_change(struct psi_group *group, int cpu,
>  	return state_mask;
>  }
>  
> -static struct psi_group *iterate_groups(struct task_struct *task, void **iter)
> +static struct cgroup *psi_task_cgroup(struct task_struct *task, enum psi_res res)
> +{
> +	switch (res) {
> +	case NR_PSI_RESOURCES:
> +		return task_dfl_cgroup(task);
> +	case PSI_IO:
> +		return task_cgroup(task, io_cgrp_subsys.id);
> +	case PSI_MEM:
> +		return task_cgroup(task, memory_cgrp_subsys.id);
> +	case PSI_CPU:
> +		return task_cgroup(task, cpu_cgrp_subsys.id);
> +	default:  /* won't reach here */
> +		return NULL;
> +	}
> +}
> +
> +static struct psi_group *iterate_groups(struct task_struct *task, void **iter,
> +					enum psi_res res)
>  {
>  #ifdef CONFIG_CGROUPS
>  	struct cgroup *cgroup = NULL;
>  
>  	if (!*iter)
> -		cgroup = task->cgroups->dfl_cgrp;
> +		cgroup = psi_task_cgroup(task, res);
>  	else if (*iter == &psi_system)
>  		return NULL;
>  	else
> @@ -776,15 +793,45 @@ void psi_task_change(struct task_struct *task, int clear, int set)
>  		     wq_worker_last_func(task) == psi_avgs_work))
>  		wake_clock = false;
>  
> -	while ((group = iterate_groups(task, &iter))) {
> -		u32 state_mask = psi_group_change(group, cpu, clear, set);
> +	if (cgroup_subsys_on_dfl(cpu_cgrp_subsys) ||
> +	    cgroup_subsys_on_dfl(memory_cgrp_subsys) ||
> +	    cgroup_subsys_on_dfl(io_cgrp_subsys)) {
> +		while ((group = iterate_groups(task, &iter, NR_PSI_RESOURCES))) {
> +			u32 state_mask = psi_group_change(group, cpu, clear, set);
>  
> -		if (state_mask & group->poll_states)
> -			psi_schedule_poll_work(group, 1);
> +			if (state_mask & group->poll_states)
> +				psi_schedule_poll_work(group, 1);
>  
> -		if (wake_clock && !delayed_work_pending(&group->avgs_work))
> -			schedule_delayed_work(&group->avgs_work, PSI_FREQ);
> +			if (wake_clock && !delayed_work_pending(&group->avgs_work))
> +				schedule_delayed_work(&group->avgs_work, PSI_FREQ);
> +		}
> +	} else {
> +		enum psi_task_count i;
> +		enum psi_res res;
> +		int psi_flags = clear | set;
> +
> +		for (i = NR_IOWAIT; i < NR_PSI_TASK_COUNTS; i++) {
> +			if ((i == NR_IOWAIT) && (psi_flags & TSK_IOWAIT))
> +				res = PSI_IO;
> +			else if ((i == NR_MEMSTALL) && (psi_flags & TSK_MEMSTALL))
> +				res = PSI_MEM;
> +			else if ((i == NR_RUNNING) && (psi_flags & TSK_RUNNING))
> +				res = PSI_CPU;
> +			else
> +				continue;
> +
> +			while ((group = iterate_groups(task, &iter, res))) {
> +				u32 state_mask = psi_group_change(group, cpu, clear, set);

This doesn't work. Each resource's pressure states are derived from the
full set of task state counters:

static bool test_state(unsigned int *tasks, enum psi_states state)
{
	switch (state) {
	case PSI_IO_SOME:
		return tasks[NR_IOWAIT];
	case PSI_IO_FULL:
		return tasks[NR_IOWAIT] && !tasks[NR_RUNNING];
	case PSI_MEM_SOME:
		return tasks[NR_MEMSTALL];
	case PSI_MEM_FULL:
		return tasks[NR_MEMSTALL] && !tasks[NR_RUNNING];
	case PSI_CPU_SOME:
		return tasks[NR_RUNNING] > 1;
	case PSI_NONIDLE:
		return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] ||
			tasks[NR_RUNNING];
	default:
		return false;
	}
}

So the IO controller's groups need an accurate NR_RUNNING count to tell
some from full, the memory controller's groups need NR_IOWAIT to tell
nonidle, and so on. In v1, a task's io, memory and cpu cgroups can all
be different, so a walk that applies only the "matching" task state to
each hierarchy leaves the other counters in each group stale.
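
To make that concrete (hypothetical v1 groups A and B, illustration
only, not kernel code):

/* task in io group A, cpu group B; the task stops running and, with
 * the per-resource walk above, only the cpu hierarchy is updated: */
B->tasks[NR_RUNNING]--;

/* A's NR_RUNNING was never touched, so for A
 *   PSI_IO_FULL = tasks[NR_IOWAIT] && !tasks[NR_RUNNING]
 * is evaluated against a stale count, and some vs. full IO pressure
 * is mis-measured. */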

You need to run the full psi task tracking and aggregation machinery
separately for each of the hierarchies a task can be in under v1.
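
Spelled out, that would be something like the following rough sketch
(not a proposal; the subsys table and the unconditional triple walk
are mine for illustration):

/*
 * Apply the COMPLETE clear/set mask to every group in all three v1
 * hierarchies, so each group's tasks[] counters stay internally
 * consistent for test_state(). Root groups would still need to fall
 * through to psi_system, elided here.
 */
static const int psi_v1_subsys[] = {
	io_cgrp_id, memory_cgrp_id, cpu_cgrp_id,
};
int i;

for (i = 0; i < ARRAY_SIZE(psi_v1_subsys); i++) {
	struct cgroup *cgroup = task_cgroup(task, psi_v1_subsys[i]);

	/* full ancestor walk, once per hierarchy */
	for (; cgroup && cgroup_parent(cgroup); cgroup = cgroup_parent(cgroup))
		psi_group_change(cgroup_psi(cgroup), cpu, clear, set);
}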

Needless to say, that is expensive. For cpu, memory and io, it's
triple the scheduling overhead: three ancestor walks and three times
the cache footprint on every task state change, plus three times the
aggregation workers every two seconds... We could never turn this on
by default.

Have you considered just co-mounting cgroup2, if for nothing else, to
get the pressure numbers?
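
That works without giving up any of the v1 controller mounts; roughly
(paths illustrative, and assuming CONFIG_PSI=y, plus psi=1 on the
kernel command line if the kernel was built with
CONFIG_PSI_DEFAULT_DISABLED):

	mount -t cgroup2 none /sys/fs/cgroup/unified
	mkdir /sys/fs/cgroup/unified/workload
	echo $PID > /sys/fs/cgroup/unified/workload/cgroup.procs
	cat /sys/fs/cgroup/unified/workload/cpu.pressure

The pressure files are part of the cgroup2 core interface, so they
show up even with no controllers enabled on that hierarchy.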



