Hi, Peter

Could you please give some comments on this one?

Regards,
Michael Wang

On 2020/2/7 11:37 AM, 王贇 wrote:
> Forwarded:
>
> Hi, Peter, Ingo
>
> Could you give some comments please?
>
> As Mel replied previously, he doesn't disagree with the idea, so we're
> looking forward to the maintainers' opinion.
>
> Please allow me to highlight the necessity of monitoring NUMA Balancing
> again: this feature is critical to performance on NUMA platforms, and its
> cost and benefit can both be significant, yet there is not enough
> information for an admin to analyze the trade-off; locality could be the
> missing piece.
>
> Regards,
> Michael Wang
>
> On 2020/2/7 11:35 AM, 王贇 wrote:
>> Currently there is no good approach to monitoring per-cgroup NUMA
>> efficiency; this can be a problem especially when groups are sharing
>> CPUs, as we don't know which one introduced the remote memory accesses.
>>
>> Although the per-task NUMA access info from the PMU is good for further
>> debugging, it is not lightweight enough for daily monitoring, especially
>> on a box with thousands of tasks.
>>
>> Fortunately, when NUMA Balancing is enabled, it periodically triggers
>> page faults and tries to increase NUMA locality; by tracing the results
>> we are able to estimate the NUMA efficiency.
>>
>> On each NUMA Balancing page fault, when the task's executing CPU is on
>> the same node as the page, we call it a local page access, otherwise a
>> remote page access.
>>
>> By folding the task's access counters into its cgroup on ticks, we get
>> the per-cgroup NUMA locality info.
>>
>> For example, the new entry 'cpu.numa_stat' shows:
>>   page_access local=1231412 remote=53453
>>
>> Here we know the workloads in the hierarchy have been traced for 1284865
>> page accesses in total, and 1231412 of them were local page accesses,
>> which implies a good NUMA efficiency.
>>
>> By monitoring the increments, we are able to locate the per-cgroup
>> workloads which NUMA Balancing can't help with (usually caused by wrong
>> CPU and memory node bindings), and then we get a chance to fix that in
>> time.
>>
>> Cc: Mel Gorman <mgorman@xxxxxxx>
>> Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
>> Cc: Michal Koutný <mkoutny@xxxxxxxx>
>> Signed-off-by: Michael Wang <yun.wang@xxxxxxxxxxxxxxxxx>
>> ---
>>  include/linux/sched.h        | 15 +++++++++
>>  include/linux/sched/sysctl.h |  6 ++++
>>  init/Kconfig                 |  9 ++++++
>>  kernel/sched/core.c          | 75 ++++++++++++++++++++++++++++++++++++++++++++
>>  kernel/sched/fair.c          | 62 ++++++++++++++++++++++++++++++++++++
>>  kernel/sched/sched.h         | 12 +++++++
>>  kernel/sysctl.c              | 11 +++++++
>>  7 files changed, 190 insertions(+)
>>
>> diff --git a/include/linux/sched.h b/include/linux/sched.h
>> index a6c924fa1c77..74bf234bae53 100644
>> --- a/include/linux/sched.h
>> +++ b/include/linux/sched.h
>> @@ -1128,6 +1128,21 @@ struct task_struct {
>>  	unsigned long numa_pages_migrated;
>>  #endif /* CONFIG_NUMA_BALANCING */
>>
>> +#ifdef CONFIG_CGROUP_NUMA_LOCALITY
>> +	/*
>> +	 * Counter indices stand for:
>> +	 * 0 -- remote page accesses
>> +	 * 1 -- local page accesses
>> +	 * 2 -- remote page accesses already updated into the cgroup
>> +	 * 3 -- local page accesses already updated into the cgroup
>> +	 *
>> +	 * We record the counters before the end of task_numa_fault(); this
>> +	 * is based on the fact that after the page fault is handled, the
>> +	 * task will access the page on the CPU where it triggered the PF.
>> +	 */
>> +	unsigned long numa_page_access[4];
>> +#endif
>> +
>>  #ifdef CONFIG_RSEQ
>>  	struct rseq __user *rseq;
>>  	u32 rseq_sig;
>> diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
>> index d4f6215ee03f..bb3721cf48e0 100644
>> --- a/include/linux/sched/sysctl.h
>> +++ b/include/linux/sched/sysctl.h
>> @@ -101,4 +101,10 @@ extern int sched_energy_aware_handler(struct ctl_table *table, int write,
>>  					 loff_t *ppos);
>>  #endif
>>
>> +#ifdef CONFIG_CGROUP_NUMA_LOCALITY
>> +extern int sysctl_numa_locality(struct ctl_table *table, int write,
>> +				void __user *buffer, size_t *lenp,
>> +				loff_t *ppos);
>> +#endif
>> +
>>  #endif /* _LINUX_SCHED_SYSCTL_H */
>> diff --git a/init/Kconfig b/init/Kconfig
>> index 322fd2c65304..63c6b90a515d 100644
>> --- a/init/Kconfig
>> +++ b/init/Kconfig
>> @@ -813,6 +813,15 @@ config NUMA_BALANCING_DEFAULT_ENABLED
>>  	  If set, automatic NUMA balancing will be enabled if running on a NUMA
>>  	  machine.
>>
>> +config CGROUP_NUMA_LOCALITY
>> +	bool "per-cgroup NUMA Locality"
>> +	default n
>> +	depends on CGROUP_SCHED && NUMA_BALANCING
>> +	help
>> +	  This option enables the collection of per-cgroup NUMA locality info,
>> +	  which tells whether NUMA Balancing is working well for a particular
>> +	  workload and also implies the NUMA efficiency.
>> +
>>  menuconfig CGROUPS
>>  	bool "Control Group support"
>>  	select KERNFS
>> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
>> index e7b08d52db93..40dd6b221eef 100644
>> --- a/kernel/sched/core.c
>> +++ b/kernel/sched/core.c
>> @@ -7657,6 +7657,68 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
>>  }
>>  #endif /* CONFIG_RT_GROUP_SCHED */
>>
>> +#ifdef CONFIG_CGROUP_NUMA_LOCALITY
>> +DEFINE_STATIC_KEY_FALSE(sched_numa_locality);
>> +
>> +#ifdef CONFIG_PROC_SYSCTL
>> +int sysctl_numa_locality(struct ctl_table *table, int write,
>> +			 void __user *buffer, size_t *lenp, loff_t *ppos)
>> +{
>> +	struct ctl_table t;
>> +	int err;
>> +	int state = static_branch_likely(&sched_numa_locality);
>> +
>> +	if (write && !capable(CAP_SYS_ADMIN))
>> +		return -EPERM;
>> +
>> +	t = *table;
>> +	t.data = &state;
>> +	err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
>> +	if (err < 0 || !write)
>> +		return err;
>> +
>> +	if (state)
>> +		static_branch_enable(&sched_numa_locality);
>> +	else
>> +		static_branch_disable(&sched_numa_locality);
>> +
>> +	return err;
>> +}
>> +#endif
>> +
>> +static inline struct cfs_rq *tg_cfs_rq(struct task_group *tg, int cpu)
>> +{
>> +	return tg == &root_task_group ? &cpu_rq(cpu)->cfs : tg->cfs_rq[cpu];
>> +}
>> +
>> +static int cpu_numa_stat_show(struct seq_file *sf, void *v)
>> +{
>> +	int cpu;
>> +	u64 local = 0, remote = 0;
>> +	struct task_group *tg = css_tg(seq_css(sf));
>> +
>> +	if (!static_branch_likely(&sched_numa_locality))
>> +		return 0;
>> +
>> +	for_each_possible_cpu(cpu) {
>> +		local += tg_cfs_rq(tg, cpu)->local_page_access;
>> +		remote += tg_cfs_rq(tg, cpu)->remote_page_access;
>> +	}
>> +
>> +	seq_printf(sf, "page_access local=%llu remote=%llu\n", local, remote);
>> +
>> +	return 0;
>> +}
>> +
>> +static __init int numa_locality_setup(char *opt)
>> +{
>> +	static_branch_enable(&sched_numa_locality);
>> +
>> +	return 0;
>> +}
>> +__setup("numa_locality", numa_locality_setup);
>> +#endif
>> +
>>  static struct cftype cpu_legacy_files[] = {
>>  #ifdef CONFIG_FAIR_GROUP_SCHED
>>  	{
>> @@ -7706,6 +7768,12 @@ static struct cftype cpu_legacy_files[] = {
>>  		.seq_show = cpu_uclamp_max_show,
>>  		.write = cpu_uclamp_max_write,
>>  	},
>> +#endif
>> +#ifdef CONFIG_CGROUP_NUMA_LOCALITY
>> +	{
>> +		.name = "numa_stat",
>> +		.seq_show = cpu_numa_stat_show,
>> +	},
>>  #endif
>>  	{ }	/* Terminate */
>>  };
>> @@ -7887,6 +7955,13 @@ static struct cftype cpu_files[] = {
>>  		.seq_show = cpu_uclamp_max_show,
>>  		.write = cpu_uclamp_max_write,
>>  	},
>> +#endif
>> +#ifdef CONFIG_CGROUP_NUMA_LOCALITY
>> +	{
>> +		.name = "numa_stat",
>> +		.flags = CFTYPE_NOT_ON_ROOT,
>> +		.seq_show = cpu_numa_stat_show,
>> +	},
>>  #endif
>>  	{ }	/* terminate */
>>  };
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index 2d170b5da0e3..eb838557bae2 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -1049,7 +1049,63 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
>>   * Scheduling class queueing methods:
>>   */
>>
>> +#ifdef CONFIG_CGROUP_NUMA_LOCALITY
>> +/*
>> + * We want to record the real local/remote page access statistics
>> + * here, so 'pnid' should be the page's real residential node after
>> + * migrate_misplaced_page(), and 'cnid' should be the node of the
>> + * CPU that triggered the PF.
>> + */
>> +static inline void
>> +update_task_locality(struct task_struct *p, int pnid, int cnid, int pages)
>> +{
>> +	if (!static_branch_unlikely(&sched_numa_locality))
>> +		return;
>> +
>> +	/*
>> +	 * pnid != cnid --> remote idx 0
>> +	 * pnid == cnid --> local idx 1
>> +	 */
>> +	p->numa_page_access[!!(pnid == cnid)] += pages;
>> +}
>> +
>> +static inline void update_group_locality(struct cfs_rq *cfs_rq)
>> +{
>> +	unsigned long ldiff, rdiff;
>> +
>> +	if (!static_branch_unlikely(&sched_numa_locality))
>> +		return;
>> +
>> +	rdiff = current->numa_page_access[0] - current->numa_page_access[2];
>> +	ldiff = current->numa_page_access[1] - current->numa_page_access[3];
>> +	if (!ldiff && !rdiff)
>> +		return;
>> +
>> +	cfs_rq->local_page_access += ldiff;
>> +	cfs_rq->remote_page_access += rdiff;
>> +
>> +	/*
>> +	 * Consider it updated once we reach the root cfs_rq; no NUMA Balancing
>> +	 * PF should happen on the current task during the hierarchical update.
>> +	 */
>> +	if (&cfs_rq->rq->cfs == cfs_rq) {
>> +		current->numa_page_access[2] = current->numa_page_access[0];
>> +		current->numa_page_access[3] = current->numa_page_access[1];
>> +	}
>> +}
>> +#else
>> +static inline void
>> +update_task_locality(struct task_struct *p, int pnid, int cnid, int pages)
>> +{
>> +}
>> +
>> +static inline void update_group_locality(struct cfs_rq *cfs_rq)
>> +{
>> +}
>> +#endif /* CONFIG_CGROUP_NUMA_LOCALITY */
>> +
>>  #ifdef CONFIG_NUMA_BALANCING
>> +
>>  /*
>>   * Approximate time to scan a full NUMA task in ms. The task scan period is
>>   * calculated based on the tasks virtual memory size and
>> @@ -2465,6 +2521,8 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
>>  	p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
>>  	p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
>>  	p->numa_faults_locality[local] += pages;
>> +
>> +	update_task_locality(p, mem_node, numa_node_id(), pages);
>>  }
>>
>>  static void reset_ptenuma_scan(struct task_struct *p)
>> @@ -2650,6 +2708,9 @@ void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
>>  	p->last_sum_exec_runtime = 0;
>>
>>  	init_task_work(&p->numa_work, task_numa_work);
>> +#ifdef CONFIG_CGROUP_NUMA_LOCALITY
>> +	memset(p->numa_page_access, 0, sizeof(p->numa_page_access));
>> +#endif
>>
>>  	/* New address space, reset the preferred nid */
>>  	if (!(clone_flags & CLONE_VM)) {
>> @@ -4313,6 +4374,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
>>  	 */
>>  	update_load_avg(cfs_rq, curr, UPDATE_TG);
>>  	update_cfs_group(curr);
>> +	update_group_locality(cfs_rq);
>>
>>  #ifdef CONFIG_SCHED_HRTICK
>>  	/*
>> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
>> index 1a88dc8ad11b..66b4e581b6ed 100644
>> --- a/kernel/sched/sched.h
>> +++ b/kernel/sched/sched.h
>> @@ -575,6 +575,14 @@ struct cfs_rq {
>>  	struct list_head throttled_list;
>>  #endif /* CONFIG_CFS_BANDWIDTH */
>>  #endif /* CONFIG_FAIR_GROUP_SCHED */
>> +#ifdef CONFIG_CGROUP_NUMA_LOCALITY
>> +	/*
>> +	 * The local/remote page access info collected from all
>> +	 * the tasks in the hierarchy.
>> +	 */
>> +	u64 local_page_access;
>> +	u64 remote_page_access;
>> +#endif
>>  };
>>
>>  static inline int rt_bandwidth_enabled(void)
>> @@ -1601,6 +1609,10 @@ static const_debug __maybe_unused unsigned int sysctl_sched_features =
>>  extern struct static_key_false sched_numa_balancing;
>>  extern struct static_key_false sched_schedstats;
>>
>> +#ifdef CONFIG_CGROUP_NUMA_LOCALITY
>> +extern struct static_key_false sched_numa_locality;
>> +#endif
>> +
>>  static inline u64 global_rt_period(void)
>>  {
>>  	return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
>> diff --git a/kernel/sysctl.c b/kernel/sysctl.c
>> index d396aaaf19a3..a8f5951f92b3 100644
>> --- a/kernel/sysctl.c
>> +++ b/kernel/sysctl.c
>> @@ -428,6 +428,17 @@ static struct ctl_table kern_table[] = {
>>  		.extra2		= SYSCTL_ONE,
>>  	},
>>  #endif /* CONFIG_NUMA_BALANCING */
>> +#ifdef CONFIG_CGROUP_NUMA_LOCALITY
>> +	{
>> +		.procname	= "numa_locality",
>> +		.data		= NULL, /* filled in by handler */
>> +		.maxlen		= sizeof(unsigned int),
>> +		.mode		= 0644,
>> +		.proc_handler	= sysctl_numa_locality,
>> +		.extra1		= SYSCTL_ZERO,
>> +		.extra2		= SYSCTL_ONE,
>> +	},
>> +#endif /* CONFIG_CGROUP_NUMA_LOCALITY */
>>  #endif /* CONFIG_SCHED_DEBUG */
>>  	{
>>  		.procname	= "sched_rt_period_us",
>>
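
For reference, once the feature is switched on via the 'numa_locality' boot
parameter or the kernel.numa_locality sysctl added by this patch, the locality
of a group can be estimated from the increments of its cpu.numa_stat. Below is
a minimal userspace sketch (illustration only, not part of the patch; the
cgroup path is just an example and assumes the v1 cpu controller is mounted at
/sys/fs/cgroup/cpu):

/*
 * Sample cpu.numa_stat twice and report the locality of the increments
 * seen during the interval.
 */
#include <stdio.h>
#include <unistd.h>

#define NUMA_STAT "/sys/fs/cgroup/cpu/example_group/cpu.numa_stat"

static int read_stat(unsigned long long *local, unsigned long long *remote)
{
	FILE *f = fopen(NUMA_STAT, "r");
	int ret;

	if (!f)
		return -1;
	ret = fscanf(f, "page_access local=%llu remote=%llu", local, remote);
	fclose(f);
	return ret == 2 ? 0 : -1;
}

int main(void)
{
	unsigned long long l0, r0, l1, r1, dl, dr;

	if (read_stat(&l0, &r0))
		return 1;
	sleep(10);	/* sampling interval */
	if (read_stat(&l1, &r1))
		return 1;

	dl = l1 - l0;	/* local page accesses during the interval */
	dr = r1 - r0;	/* remote page accesses during the interval */
	if (dl + dr)
		printf("locality: %llu%% (%llu local of %llu accesses)\n",
		       dl * 100 / (dl + dr), dl, dl + dr);
	else
		printf("no NUMA Balancing page faults during the interval\n");
	return 0;
}

A ratio close to 100% over the sampling window suggests NUMA Balancing is
working well for that group, while a persistently low ratio points to the kind
of wrong CPU and memory node bindings mentioned in the changelog above.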