Re: [RFC PATCH 2/3] psi: cgroup v1 support

Joseph Qi <joseph.qi@xxxxxxxxxxxxxxxxx> · Wed, 5 Jun 2019 09:15:33 +0800

Hi Johannes,

Thanks for the quick comments.

On 19/6/4 19:55, Johannes Weiner wrote:
> On Tue, Jun 04, 2019 at 09:57:44AM +0800, Joseph Qi wrote:
>> Implements pressure stall tracking for cgroup v1.
>>
>> Signed-off-by: Joseph Qi <joseph.qi@xxxxxxxxxxxxxxxxx>
>> ---
>>  kernel/sched/psi.c | 65 +++++++++++++++++++++++++++++++++++++++-------
>>  1 file changed, 56 insertions(+), 9 deletions(-)
>>
>> diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
>> index 7acc632c3b82..909083c828d5 100644
>> --- a/kernel/sched/psi.c
>> +++ b/kernel/sched/psi.c
>> @@ -719,13 +719,30 @@ static u32 psi_group_change(struct psi_group *group, int cpu,
>>  	return state_mask;
>>  }
>>  
>> -static struct psi_group *iterate_groups(struct task_struct *task, void **iter)
>> +static struct cgroup *psi_task_cgroup(struct task_struct *task, enum psi_res res)
>> +{
>> +	switch (res) {
>> +	case NR_PSI_RESOURCES:
>> +		return task_dfl_cgroup(task);
>> +	case PSI_IO:
>> +		return task_cgroup(task, io_cgrp_subsys.id);
>> +	case PSI_MEM:
>> +		return task_cgroup(task, memory_cgrp_subsys.id);
>> +	case PSI_CPU:
>> +		return task_cgroup(task, cpu_cgrp_subsys.id);
>> +	default:  /* won't reach here */
>> +		return NULL;
>> +	}
>> +}
>> +
>> +static struct psi_group *iterate_groups(struct task_struct *task, void **iter,
>> +					enum psi_res res)
>>  {
>>  #ifdef CONFIG_CGROUPS
>>  	struct cgroup *cgroup = NULL;
>>  
>>  	if (!*iter)
>> -		cgroup = task->cgroups->dfl_cgrp;
>> +		cgroup = psi_task_cgroup(task, res);
>>  	else if (*iter == &psi_system)
>>  		return NULL;
>>  	else
>> @@ -776,15 +793,45 @@ void psi_task_change(struct task_struct *task, int clear, int set)
>>  		     wq_worker_last_func(task) == psi_avgs_work))
>>  		wake_clock = false;
>>  
>> -	while ((group = iterate_groups(task, &iter))) {
>> -		u32 state_mask = psi_group_change(group, cpu, clear, set);
>> +	if (cgroup_subsys_on_dfl(cpu_cgrp_subsys) ||
>> +	    cgroup_subsys_on_dfl(memory_cgrp_subsys) ||
>> +	    cgroup_subsys_on_dfl(io_cgrp_subsys)) {
>> +		while ((group = iterate_groups(task, &iter, NR_PSI_RESOURCES))) {
>> +			u32 state_mask = psi_group_change(group, cpu, clear, set);
>>  
>> -		if (state_mask & group->poll_states)
>> -			psi_schedule_poll_work(group, 1);
>> +			if (state_mask & group->poll_states)
>> +				psi_schedule_poll_work(group, 1);
>>  
>> -		if (wake_clock && !delayed_work_pending(&group->avgs_work))
>> -			schedule_delayed_work(&group->avgs_work, PSI_FREQ);
>> +			if (wake_clock && !delayed_work_pending(&group->avgs_work))
>> +				schedule_delayed_work(&group->avgs_work, PSI_FREQ);
>> +		}
>> +	} else {
>> +		enum psi_task_count i;
>> +		enum psi_res res;
>> +		int psi_flags = clear | set;
>> +
>> +		for (i = NR_IOWAIT; i < NR_PSI_TASK_COUNTS; i++) {
>> +			if ((i == NR_IOWAIT) && (psi_flags & TSK_IOWAIT))
>> +				res = PSI_IO;
>> +			else if ((i == NR_MEMSTALL) && (psi_flags & TSK_MEMSTALL))
>> +				res = PSI_MEM;
>> +			else if ((i == NR_RUNNING) && (psi_flags & TSK_RUNNING))
>> +				res = PSI_CPU;
>> +			else
>> +				continue;
>> +
>> +			while ((group = iterate_groups(task, &iter, res))) {
>> +				u32 state_mask = psi_group_change(group, cpu, clear, set);
> 
> This doesn't work. Each resource state is composed of all possible
> task states:
> 
> static bool test_state(unsigned int *tasks, enum psi_states state)
> {
> 	switch (state) {
> 	case PSI_IO_SOME:
> 		return tasks[NR_IOWAIT];
> 	case PSI_IO_FULL:
> 		return tasks[NR_IOWAIT] && !tasks[NR_RUNNING];
> 	case PSI_MEM_SOME:
> 		return tasks[NR_MEMSTALL];
> 	case PSI_MEM_FULL:
> 		return tasks[NR_MEMSTALL] && !tasks[NR_RUNNING];
> 	case PSI_CPU_SOME:
> 		return tasks[NR_RUNNING] > 1;
> 	case PSI_NONIDLE:
> 		return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] ||
> 			tasks[NR_RUNNING];
> 	default:
> 		return false;
> 	}
> }
> 
> So the IO controller needs to know of NR_RUNNING to tell some vs full,
> the memory controller needs to know of NR_IOWAIT to tell nonidle etc.
> 
> You need to run the full psi task tracking and aggregation machinery
> separately for each of the different cgroups a task can be in in v1.
> 
Yes, since different controllers have their own hierarchy.

> Needless to say, that is expensive. For cpu, memory and io, it's
> triple the scheduling overhead with three ancestor walks and three
> times the cache footprint; three times more aggregation workers every
> two seconds... We could never turn this on per default.
> 
IC, but even on cgroup v2, would it still be expensive if we have many
cgroups?

> Have you considered just co-mounting cgroup2, if for nothing else, to
> get the pressure numbers?
> 
Do you mean mounting cgroup1 and cgroup2 at the same time? 
IIUC, this may not work since many cgroup code have xxx_on_dfl check.

Thanks,
Joseph