On Tue, Nov 08, 2022 at 03:07:42PM -0500, Mathieu Desnoyers wrote:
> On 2022-11-08 08:04, Peter Zijlstra wrote:
> > On Thu, Nov 03, 2022 at 04:03:43PM -0400, Mathieu Desnoyers wrote:
> > 
> > > The credit goes to Paul Turner (Google) for the vcpu_id idea. This
> > > feature is implemented based on the discussions with Paul Turner and
> > > Peter Oskolkov (Google), but I took the liberty to implement scheduler
> > > fast-path optimizations and my own NUMA-awareness scheme. Rumor has it
> > > that Google has been running an rseq vcpu_id extension internally in
> > > production for a year. The tcmalloc source code indeed has comments
> > > hinting at a vcpu_id prototype extension to the rseq system call [1].
> > 
> > Re the NUMA thing -- that means that on a 512-node system a
> > single-threaded task can still observe 512 separate vcpu-ids, right?
> 
> Yes, that's correct.

So... I've been thinking. How about we expose _two_ vcpu-ids :-)

One is the original, system-wide ID, bounded by min(nr_threads, nr_cpus);
the other is a per-node vcpu-id, not allocated the way you did it, but
bounded by min(nr_threads, nr_cpus_per_node).

That way userspace can either use:

 - rseq::cpu_id -- as it does today;

 - rseq::vcpu_id -- the original vcpu_id proposal (when it reads -1, see
   rseq::cpu_id). This is typically good enough when your application is
   not NUMA aware and relies on NUMA balancing, which is nearly every
   application out there;

 - rseq::node_id *and* rseq::vcpu_node_id -- combined they give both node
   locality and a *dense* ID space within each node.

This then lets the application pick whatever is best; a rough sketch of how
userspace might consume these fields is appended after the patch.

Note that using node-affine memory allocations by default is a *very* bad
idea, since it wrecks NUMA balancing. You should really only use this if
your application is fully NUMA aware and knows what it is doing.

---
 include/linux/mm.h        |   15 ++--
 include/linux/mm_types.h  |   23 +-----
 include/linux/sched.h     |    1 
 include/uapi/linux/rseq.h |    1 
 kernel/fork.c             |    1 
 kernel/rseq.c             |    9 ++
 kernel/sched/core.c       |   18 ++---
 kernel/sched/sched.h      |  157 +++++++++-------------------------------------
 8 files changed, 66 insertions(+), 159 deletions(-)

--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3484,6 +3484,10 @@ static inline int task_mm_vcpu_id(struct
 {
 	return t->mm_vcpu;
 }
+static inline int task_mm_vcpu_node_id(struct task_struct *t)
+{
+	return t->mm_vcpu_node;
+}
 #else
 static inline void sched_vcpu_before_execve(struct task_struct *t) { }
 static inline void sched_vcpu_after_execve(struct task_struct *t) { }
@@ -3491,12 +3495,11 @@ static inline void sched_vcpu_fork(struc
 static inline void sched_vcpu_exit_signals(struct task_struct *t) { }
 static inline int task_mm_vcpu_id(struct task_struct *t)
 {
-	/*
-	 * Use the processor id as a fall-back when the mm vcpu feature is
-	 * disabled. This provides functional per-cpu data structure accesses
-	 * in user-space, althrough it won't provide the memory usage benefits.
-	 */
-	return raw_smp_processor_id();
+	return -1; /* userspace should use cpu_id */
+}
+static inline int task_mm_vcpu_node_id(struct task_struct *t)
+{
+	return -1;
 }
 #endif
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -568,7 +568,7 @@ struct mm_struct {
 		 * bitmap words as they were being concurrently updated by the
 		 * updaters.
 		 */
-		spinlock_t vcpu_lock;
+		raw_spinlock_t vcpu_lock;
 #endif
 #ifdef CONFIG_MMU
 		atomic_long_t pgtables_bytes;	/* PTE page table pages */
@@ -875,28 +875,15 @@ static inline unsigned int mm_vcpumask_s
 
 #if defined(CONFIG_SCHED_MM_VCPU) && defined(CONFIG_NUMA)
 /*
- * Layout of node vcpumasks:
- * - node_alloc vcpumask:        cpumask tracking which vcpu_id were
- *                               allocated (across nodes) in this
- *                               memory space.
  * - node vcpumask[nr_node_ids]: per-node cpumask tracking which vcpu_id
  *                               were allocated in this memory space.
  */
-static inline cpumask_t *mm_node_alloc_vcpumask(struct mm_struct *mm)
+static inline cpumask_t *mm_node_vcpumask(struct mm_struct *mm, unsigned int node)
 {
 	unsigned long vcpu_bitmap = (unsigned long)mm_vcpumask(mm);
 
 	/* Skip mm_vcpumask */
 	vcpu_bitmap += cpumask_size();
-	return (struct cpumask *)vcpu_bitmap;
-}
-
-static inline cpumask_t *mm_node_vcpumask(struct mm_struct *mm, unsigned int node)
-{
-	unsigned long vcpu_bitmap = (unsigned long)mm_node_alloc_vcpumask(mm);
-
-	/* Skip node alloc vcpumask */
-	vcpu_bitmap += cpumask_size();
 	vcpu_bitmap += node * cpumask_size();
 	return (struct cpumask *)vcpu_bitmap;
 }
@@ -907,21 +894,21 @@ static inline void mm_init_node_vcpumask
 	if (num_possible_nodes() == 1)
 		return;
-	cpumask_clear(mm_node_alloc_vcpumask(mm));
+
 	for (node = 0; node < nr_node_ids; node++)
 		cpumask_clear(mm_node_vcpumask(mm, node));
 }
 
 static inline void mm_init_vcpu_lock(struct mm_struct *mm)
 {
-	spin_lock_init(&mm->vcpu_lock);
+	raw_spin_lock_init(&mm->vcpu_lock);
 }
 
 static inline unsigned int mm_node_vcpumask_size(void)
 {
 	if (num_possible_nodes() == 1)
 		return 0;
-	return (nr_node_ids + 1) * cpumask_size();
+	return nr_node_ids * cpumask_size();
 }
 #else
 static inline void mm_init_node_vcpumask(struct mm_struct *mm) { }
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1316,6 +1316,7 @@ struct task_struct {
 
 #ifdef CONFIG_SCHED_MM_VCPU
 	int			mm_vcpu;	/* Current vcpu in mm */
+	int			mm_vcpu_node;	/* Current vcpu for this node */
 	int			mm_vcpu_active;	/* Whether vcpu bitmap is active */
 #endif
--- a/include/uapi/linux/rseq.h
+++ b/include/uapi/linux/rseq.h
@@ -147,6 +147,7 @@ struct rseq {
 	 * (allocated uniquely within a memory space).
 	 */
 	__u32 vm_vcpu_id;
+	__u32 vm_vcpu_node_id;
 	/*
 	 * Flexible array member at end of structure, after last feature field.
 	 */
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1049,6 +1049,7 @@ static struct task_struct *dup_task_stru
 
 #ifdef CONFIG_SCHED_MM_VCPU
 	tsk->mm_vcpu = -1;
+	tsk->mm_vcpu_node = -1;
 	tsk->mm_vcpu_active = 0;
 #endif
 	return tsk;
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -91,6 +91,7 @@ static int rseq_update_cpu_node_id(struc
 	u32 cpu_id = raw_smp_processor_id();
 	u32 node_id = cpu_to_node(cpu_id);
 	u32 vm_vcpu_id = task_mm_vcpu_id(t);
+	u32 vm_vcpu_node_id = task_mm_vcpu_node_id(t);
 
 	WARN_ON_ONCE((int) vm_vcpu_id < 0);
 	if (!user_write_access_begin(rseq, t->rseq_len))
@@ -99,6 +100,7 @@ static int rseq_update_cpu_node_id(struc
 	unsafe_put_user(cpu_id, &rseq->cpu_id, efault_end);
 	unsafe_put_user(node_id, &rseq->node_id, efault_end);
 	unsafe_put_user(vm_vcpu_id, &rseq->vm_vcpu_id, efault_end);
+	unsafe_put_user(vm_vcpu_node_id, &rseq->vm_vcpu_node_id, efault_end);
 	/*
 	 * Additional feature fields added after ORIG_RSEQ_SIZE
 	 * need to be conditionally updated only if
@@ -116,8 +118,8 @@ static int rseq_update_cpu_node_id(struc
 
 static int rseq_reset_rseq_cpu_node_id(struct task_struct *t)
 {
-	u32 cpu_id_start = 0, cpu_id = RSEQ_CPU_ID_UNINITIALIZED, node_id = 0,
-	    vm_vcpu_id = 0;
+	u32 cpu_id_start = 0, cpu_id = RSEQ_CPU_ID_UNINITIALIZED;
+	u32 node_id = 0, vm_vcpu_id = -1, vm_vcpu_node_id = -1;
 
 	/*
 	 * Reset cpu_id_start to its initial state (0).
@@ -141,6 +143,9 @@ static int rseq_reset_rseq_cpu_node_id(s
 	 */
 	if (put_user(vm_vcpu_id, &t->rseq->vm_vcpu_id))
 		return -EFAULT;
+
+	if (put_user(vm_vcpu_node_id, &t->rseq->vm_vcpu_node_id))
+		return -EFAULT;
 	/*
 	 * Additional feature fields added after ORIG_RSEQ_SIZE
 	 * need to be conditionally reset only if
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -11278,15 +11278,13 @@ void call_trace_sched_update_nr_running(
 
 #ifdef CONFIG_SCHED_MM_VCPU
 void sched_vcpu_exit_signals(struct task_struct *t)
 {
-	struct mm_struct *mm = t->mm;
 	unsigned long flags;
 
-	if (!mm)
+	if (!t->mm)
 		return;
+
 	local_irq_save(flags);
-	mm_vcpu_put(mm, t->mm_vcpu);
-	t->mm_vcpu = -1;
-	t->mm_vcpu_active = 0;
+	mm_vcpu_put(t);
 	local_irq_restore(flags);
 }
 
@@ -11297,10 +11295,9 @@ void sched_vcpu_before_execve(struct tas
 
 	if (!mm)
 		return;
+
 	local_irq_save(flags);
-	mm_vcpu_put(mm, t->mm_vcpu);
-	t->mm_vcpu = -1;
-	t->mm_vcpu_active = 0;
+	mm_vcpu_put(t);
 	local_irq_restore(flags);
 }
 
@@ -11312,9 +11309,9 @@ void sched_vcpu_after_execve(struct task
 	WARN_ON_ONCE((t->flags & PF_KTHREAD) || !t->mm);
 
 	local_irq_save(flags);
-	t->mm_vcpu = mm_vcpu_get(mm);
-	t->mm_vcpu_active = 1;
+	mm_vcpu_get(t);
 	local_irq_restore(flags);
+
 	rseq_set_notify_resume(t);
 }
 
@@ -11322,6 +11319,7 @@ void sched_vcpu_fork(struct task_struct
 {
 	WARN_ON_ONCE((t->flags & PF_KTHREAD) || !t->mm);
 	t->mm_vcpu = -1;
+	t->mm_vcpu_node = -1;
 	t->mm_vcpu_active = 1;
 }
 #endif
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3262,147 +3262,57 @@ static inline void update_current_exec_r
 }
 
 #ifdef CONFIG_SCHED_MM_VCPU
-static inline int __mm_vcpu_get_single_node(struct mm_struct *mm)
+static inline int __mm_vcpu_get(struct cpumask *cpumask)
 {
-	struct cpumask *cpumask;
 	int vcpu;
 
-	cpumask = mm_vcpumask(mm);
 	vcpu = cpumask_first_zero(cpumask);
-	if (vcpu >= nr_cpu_ids)
+	if (WARN_ON_ONCE(vcpu >= nr_cpu_ids))
 		return -1;
+
 	__cpumask_set_cpu(vcpu, cpumask);
 	return vcpu;
 }
 
-#ifdef CONFIG_NUMA
-static inline bool mm_node_vcpumask_test_cpu(struct mm_struct *mm, int vcpu_id)
-{
-	if (num_possible_nodes() == 1)
-		return true;
-	return cpumask_test_cpu(vcpu_id, mm_node_vcpumask(mm, numa_node_id()));
-}
-static inline int __mm_vcpu_get(struct mm_struct *mm)
+static inline void __mm_vcpu_put(struct task_struct *p, bool clear_active)
 {
-	struct cpumask *cpumask = mm_vcpumask(mm),
-		       *node_cpumask = mm_node_vcpumask(mm, numa_node_id()),
-		       *node_alloc_cpumask = mm_node_alloc_vcpumask(mm);
-	unsigned int node;
-	int vcpu;
-
-	if (num_possible_nodes() == 1)
-		return __mm_vcpu_get_single_node(mm);
-
-	/*
-	 * Try to reserve lowest available vcpu number within those already
-	 * reserved for this NUMA node.
-	 */
-	vcpu = cpumask_first_andnot(node_cpumask, cpumask);
-	if (vcpu >= nr_cpu_ids)
-		goto alloc_numa;
-	__cpumask_set_cpu(vcpu, cpumask);
-	goto end;
+	struct mm_struct *mm = p->mm;
+
+	lockdep_assert_irqs_disabled();
+	WARN_ON_ONCE(p->mm_vcpu < 0);
 
-alloc_numa:
-	/*
-	 * Try to reserve lowest available vcpu number within those not already
-	 * allocated for numa nodes.
-	 */
-	vcpu = cpumask_first_notandnot(node_alloc_cpumask, cpumask);
-	if (vcpu >= nr_cpu_ids)
-		goto numa_update;
-	__cpumask_set_cpu(vcpu, cpumask);
-	__cpumask_set_cpu(vcpu, node_cpumask);
-	__cpumask_set_cpu(vcpu, node_alloc_cpumask);
-	goto end;
-
-numa_update:
-	/*
-	 * NUMA node id configuration changed for at least one CPU in the system.
-	 * We need to steal a currently unused vcpu_id from an overprovisioned
-	 * node for our current node. Userspace must handle the fact that the
-	 * node id associated with this vcpu_id may change due to node ID
-	 * reconfiguration.
-	 *
-	 * Count how many possible cpus are attached to each (other) node id,
-	 * and compare this with the per-mm node vcpumask cpu count. Find one
-	 * which has too many cpus in its mask to steal from.
-	 */
-	for (node = 0; node < nr_node_ids; node++) {
-		struct cpumask *iter_cpumask;
-
-		if (node == numa_node_id())
-			continue;
-		iter_cpumask = mm_node_vcpumask(mm, node);
-		if (nr_cpus_node(node) < cpumask_weight(iter_cpumask)) {
-			/* Try to steal from this node. */
-			vcpu = cpumask_first_andnot(iter_cpumask, cpumask);
-			if (vcpu >= nr_cpu_ids)
-				goto steal_fail;
-			__cpumask_set_cpu(vcpu, cpumask);
-			__cpumask_clear_cpu(vcpu, iter_cpumask);
-			__cpumask_set_cpu(vcpu, node_cpumask);
-			goto end;
-		}
-	}
+	raw_spin_lock(&mm->vcpu_lock);
 
-steal_fail:
-	/*
-	 * Our attempt at gracefully stealing a vcpu_id from another
-	 * overprovisioned NUMA node failed. Fallback to grabbing the first
-	 * available vcpu_id.
-	 */
-	vcpu = cpumask_first_zero(cpumask);
-	if (vcpu >= nr_cpu_ids)
-		return -1;
-	__cpumask_set_cpu(vcpu, cpumask);
-	/* Steal vcpu from its numa node mask. */
-	for (node = 0; node < nr_node_ids; node++) {
-		struct cpumask *iter_cpumask;
-
-		if (node == numa_node_id())
-			continue;
-		iter_cpumask = mm_node_vcpumask(mm, node);
-		if (cpumask_test_cpu(vcpu, iter_cpumask)) {
-			__cpumask_clear_cpu(vcpu, iter_cpumask);
-			break;
-		}
-	}
-	__cpumask_set_cpu(vcpu, node_cpumask);
-end:
-	return vcpu;
-}
+	__cpumask_clear_cpu(p->mm_vcpu, mm_vcpumask(mm));
+	p->mm_vcpu = -1;
+#ifdef CONFIG_NUMA
+	__cpumask_clear_cpu(p->mm_vcpu_node, mm_node_vcpumask(mm, task_node(p)));
+	p->mm_vcpu_node = -1;
+#endif
+	if (clear_active)
+		p->mm_vcpu_active = 0;
-
-#else
-static inline bool mm_node_vcpumask_test_cpu(struct mm_struct *mm, int vcpu_id)
-{
-	return true;
-}
-static inline int __mm_vcpu_get(struct mm_struct *mm)
-{
-	return __mm_vcpu_get_single_node(mm);
+
+	raw_spin_unlock(&mm->vcpu_lock);
 }
-#endif
 
-static inline void mm_vcpu_put(struct mm_struct *mm, int vcpu)
+static inline void mm_vcpu_put(struct task_struct *p)
 {
-	lockdep_assert_irqs_disabled();
-	if (vcpu < 0)
-		return;
-	spin_lock(&mm->vcpu_lock);
-	__cpumask_clear_cpu(vcpu, mm_vcpumask(mm));
-	spin_unlock(&mm->vcpu_lock);
+	__mm_vcpu_put(p, true);
 }
 
-static inline int mm_vcpu_get(struct mm_struct *mm)
+static inline void mm_vcpu_get(struct task_struct *p)
 {
-	int ret;
+	struct mm_struct *mm = p->mm;
 
 	lockdep_assert_irqs_disabled();
-	spin_lock(&mm->vcpu_lock);
-	ret = __mm_vcpu_get(mm);
-	spin_unlock(&mm->vcpu_lock);
-	return ret;
+	WARN_ON_ONCE(p->mm_vcpu >= 0);
+
+	raw_spin_lock(&mm->vcpu_lock);
+	p->mm_vcpu = __mm_vcpu_get(mm_vcpumask(mm));
+#ifdef CONFIG_NUMA
+	p->mm_vcpu_node = __mm_vcpu_get(mm_node_vcpumask(mm, task_node(p)));
+#endif
+	p->mm_vcpu_active = 1;
+	raw_spin_unlock(&mm->vcpu_lock);
 }
 
@@ -3414,15 +3324,16 @@ static inline void switch_mm_vcpu(struct
 			 * Context switch between threads in same mm, hand over
 			 * the mm_vcpu from prev to next.
 			 */
-			next->mm_vcpu = prev->mm_vcpu;
-			prev->mm_vcpu = -1;
+			swap(next->mm_vcpu, prev->mm_vcpu);
+#ifdef CONFIG_NUMA
+			swap(next->mm_vcpu_node, prev->mm_vcpu_node);
+#endif
 			return;
 		}
-		mm_vcpu_put(prev->mm, prev->mm_vcpu);
-		prev->mm_vcpu = -1;
+		__mm_vcpu_put(prev, false);
 	}
 
 	if (next->mm_vcpu_active)
-		next->mm_vcpu = mm_vcpu_get(next->mm);
+		mm_vcpu_get(next);
 }
 #else
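
As referenced above, a rough, untested sketch of how userspace might consume
these fields. It assumes the uapi <linux/rseq.h> from this series, that rs
points at the current thread's registered struct rseq (thread pointer +
__rseq_offset with glibc >= 2.35), and it skips the __rseq_size feature-size
check that real code would need; the helper names are made up:

#include <stdint.h>
#include <linux/rseq.h>	/* patched header providing vm_vcpu_id et al. */

/* Dense, system-wide index: fine for NUMA-oblivious applications. */
static inline uint32_t pool_index(const volatile struct rseq *rs)
{
	int32_t vcpu = (int32_t)rs->vm_vcpu_id;

	/* -1 means no vcpu_id is available: fall back to cpu_id. */
	return vcpu < 0 ? rs->cpu_id : (uint32_t)vcpu;
}

/* NUMA-aware applications: node-local pool, dense slot within the node. */
static inline void node_pool_index(const volatile struct rseq *rs,
				   uint32_t *node, uint32_t *slot)
{
	*node = rs->node_id;
	*slot = rs->vm_vcpu_node_id;
}

The node_id/vcpu_node_id pair is only worth the trouble when the allocator
really keeps node-local pools; otherwise the plain vcpu_id already bounds the
memory footprint without fighting NUMA balancing.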