On Mon 05-12-22 00:14:29, Zhongkun He wrote:
> All vma manipulation is somewhat protected by a down_read on
> mmap_lock, so vma mempolicy is clear to obtain a reference.
> But there is no locking in process context and have a mix
> of reference counting and per-task requirements which is rather
> subtle and easy to get wrong.
> 
> we would have get_vma_policy() always returning a reference
> counted policy, except for static policy. For better performance,
> we replace atomic_t ref with percpu_ref in mempolicy, which is
> usually the performance bottleneck in hot path.
> 
> This series adjust the reference of mempolicy in process context,
> which will be protected by RCU in read hot path. Besides,
> task->mempolicy is also protected by task_lock(). Percpu_ref
> is a good way to reduce cache line bouncing.
> 
> The mpol_get/put() can just increment or decrement the local
> counter. Mpol_kill() must be called to initiate the destruction
> of mempolicy. A mempolicy will be freed when the mpol_kill()
> is called and the reference count decrese to zero.

This is really hard to follow. Without having the context from previous
discussions I would be completely lost. Please structure your cover
letter, but also other patches in general, in the following form:

- what is the problem you would like to deal with
  - want to introduce pidfd_set_mempolicy because XYZ
- what stands in the way
  - mempolicy objects access constraints (reliance on operating in the
    current context)
  - reference counting needs to be unconditional
    - why regular reference counting is not sufficient (performance)
- what is this patchset proposing
  - per cpu reference counting
  - how is it implemented
- how is the patch series structured
  - make the reference counting unconditional
  - special case static (never released) policies
  - replace standard ref counting by per-cpu reference counting
- how has this been tested?

Makes sense?

> Suggested-by: Michal Hocko <mhocko@xxxxxxxx>
> Signed-off-by: Zhongkun He <hezhongkun.hzk@xxxxxxxxxxxxx>
> ---
>  include/linux/mempolicy.h      | 65 +++++++++++++++++++------------
>  include/uapi/linux/mempolicy.h |  2 +-
>  mm/mempolicy.c                 | 71 ++++++++++++++++++++++------------
>  3 files changed, 89 insertions(+), 49 deletions(-)
> 

Is the following the cumulative diff?

> diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
> index d232de7cdc56..9178b008eadf 100644
> --- a/include/linux/mempolicy.h
> +++ b/include/linux/mempolicy.h
> @@ -28,12 +28,16 @@ struct mm_struct;
>   * of the current process.
>   *
>   * Locking policy for interleave:
> - * In process context there is no locking because only the process accesses
> - * its own state. All vma manipulation is somewhat protected by a down_read on
> + * percpu_ref is used to reduce cache line bouncing.
> + * In process context we should obtain a reference via mpol_get()
> + * protected by RCU.
> + * All vma manipulation is somewhat protected by a down_read on
>   * mmap_lock.
>   *
>   * Freeing policy:
> - * Mempolicy objects are reference counted. A mempolicy will be freed when
> + * Mempolicy objects are reference counted. The mpol_get/put can just increment
> + * or decrement the local counter. Mpol_kill() must be called to initiate the
> + * destruction of mempolicy. A mempolicy will be freed when mpol_kill()/
>   * mpol_put() decrements the reference count to zero.
>   *
>   * Duplicating policy objects:
> @@ -42,43 +46,36 @@ struct mm_struct;
>   * to 1, representing the caller of mpol_dup().
>   */
>  struct mempolicy {
> -	atomic_t refcnt;
> -	unsigned short mode;	/* See MPOL_* above */
> +	struct percpu_ref refcnt;	/* reduce cache line bouncing */
> +	unsigned short mode;		/* See MPOL_* above */
>  	unsigned short flags;	/* See set_mempolicy() MPOL_F_* above */
> +	int home_node;		/* Home node to use for MPOL_BIND and MPOL_PREFERRED_MANY */
>  	nodemask_t nodes;	/* interleave/bind/perfer */
> -	int home_node;		/* Home node to use for MPOL_BIND and MPOL_PREFERRED_MANY */
> 
>  	union {
>  		nodemask_t cpuset_mems_allowed;	/* relative to these nodes */
>  		nodemask_t user_nodemask;	/* nodemask passed by user */
> +		struct rcu_head rcu;	/* used for freeing in an RCU-safe manner */
>  	} w;
>  };
> 
>  /*
> - * Support for managing mempolicy data objects (clone, copy, destroy)
> - * The default fast path of a NULL MPOL_DEFAULT policy is always inlined.
> + * Mempolicy pol need explicit unref after use, except for
> + * static policies(default_policy, preferred_node_policy).
>   */
> -
> -extern void __mpol_put(struct mempolicy *pol);
> -static inline void mpol_put(struct mempolicy *pol)
> +static inline int mpol_needs_cond_ref(struct mempolicy *pol)
>  {
> -	if (pol)
> -		__mpol_put(pol);
> +	return (pol && !(pol->flags & MPOL_F_STATIC));
>  }
> 
>  /*
> - * Does mempolicy pol need explicit unref after use?
> - * Currently only needed for shared policies.
> + * Put a mpol reference obtained via mpol_get().
>   */
> -static inline int mpol_needs_cond_ref(struct mempolicy *pol)
> -{
> -	return (pol && (pol->flags & MPOL_F_SHARED));
> -}
> 
> -static inline void mpol_cond_put(struct mempolicy *pol)
> +static inline void mpol_put(struct mempolicy *pol)
>  {
>  	if (mpol_needs_cond_ref(pol))
> -		__mpol_put(pol);
> +		percpu_ref_put(&pol->refcnt);
>  }
> 
>  extern struct mempolicy *__mpol_dup(struct mempolicy *pol);
> @@ -91,12 +88,28 @@ static inline struct mempolicy *mpol_dup(struct mempolicy *pol)
> 
>  #define vma_policy(vma) ((vma)->vm_policy)
> 
> +/* Obtain a reference on the specified mpol */
>  static inline void mpol_get(struct mempolicy *pol)
>  {
>  	if (pol)
> -		atomic_inc(&pol->refcnt);
> +		percpu_ref_get(&pol->refcnt);
> +}
> +
> +static inline bool mpol_tryget(struct mempolicy *pol)
> +{
> +	return pol && percpu_ref_tryget(&pol->refcnt);
>  }
> 
> +/*
> + * This function initiates destruction of mempolicy.
> + */
> +static inline void mpol_kill(struct mempolicy *pol)
> +{
> +	if (pol)
> +		percpu_ref_kill(&pol->refcnt);
> +}
> +
> +
>  extern bool __mpol_equal(struct mempolicy *a, struct mempolicy *b);
>  static inline bool mpol_equal(struct mempolicy *a, struct mempolicy *b)
>  {
> @@ -197,11 +210,15 @@ static inline void mpol_put(struct mempolicy *p)
>  {
>  }
> 
> -static inline void mpol_cond_put(struct mempolicy *pol)
> +static inline void mpol_get(struct mempolicy *pol)
>  {
>  }
> 
> -static inline void mpol_get(struct mempolicy *pol)
> +static inline bool mpol_tryget(struct mempolicy *pol)
> +{
> +}
> +
> +static inline void mpol_kill(struct mempolicy *pol)
>  {
>  }
> 
> diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h
> index 046d0ccba4cd..940e1a88a4da 100644
> --- a/include/uapi/linux/mempolicy.h
> +++ b/include/uapi/linux/mempolicy.h
> @@ -60,7 +60,7 @@ enum {
>   * "mode flags". These flags are allocated from bit 0 up, as they
>   * are never OR'ed into the mode in mempolicy API arguments.
>   */
> -#define MPOL_F_SHARED	(1 << 0)	/* identify shared policies */
> +#define MPOL_F_STATIC	(1 << 0)	/* identify static policies(e.g default_policy) */
>  #define MPOL_F_MOF	(1 << 3)	/* this policy wants migrate on fault */
>  #define MPOL_F_MORON	(1 << 4)	/* Migrate On protnone Reference On Node */
> 
> diff --git a/mm/mempolicy.c b/mm/mempolicy.c
> index 61aa9aedb728..ee3e2ed5ef07 100644
> --- a/mm/mempolicy.c
> +++ b/mm/mempolicy.c
> @@ -124,8 +124,8 @@ enum zone_type policy_zone = 0;
>   * run-time system-wide default policy => local allocation
>   */
>  static struct mempolicy default_policy = {
> -	.refcnt = ATOMIC_INIT(1), /* never free it */
>  	.mode = MPOL_LOCAL,
> +	.flags = MPOL_F_STATIC,
>  };
> 
>  static struct mempolicy preferred_node_policy[MAX_NUMNODES];
> @@ -158,9 +158,32 @@ int numa_map_to_online_node(int node)
>  }
>  EXPORT_SYMBOL_GPL(numa_map_to_online_node);
> 
> +/* Obtain a reference on the specified task mempolicy */
> +static mempolicy *get_task_mpol(struct task_struct *p)
> +{
> +	struct mempolicy *pol;
> +
> +	rcu_read_lock();
> +	pol = rcu_dereference(p->mempolicy);
> +
> +	if (!pol || mpol_tryget(pol))
> +		pol = NULL;
> +	rcu_read_unlock();
> +
> +	return pol;
> +}
> +
> +static void mpol_release(struct percpu_ref *ref)
> +{
> +	struct mempolicy *mpol = container_of(ref, struct mempolicy, refcnt);
> +
> +	percpu_ref_exit(ref);
> +	kfree_rcu(mpol, w.rcu);
> +}
> +
>  struct mempolicy *get_task_policy(struct task_struct *p)
>  {
> -	struct mempolicy *pol = p->mempolicy;
> +	struct mempolicy *pol = get_task_mpol(p);
>  	int node;
> 
>  	if (pol)
> @@ -296,7 +319,12 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
>  	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
>  	if (!policy)
>  		return ERR_PTR(-ENOMEM);
> -	atomic_set(&policy->refcnt, 1);
> +
> +	if (percpu_ref_init(&policy->refcnt, mpol_release, 0,
> +				GFP_KERNEL)) {
> +		kmem_cache_free(policy_cache, policy);
> +		return ERR_PTR(-ENOMEM);
> +	}
>  	policy->mode = mode;
>  	policy->flags = flags;
>  	policy->home_node = NUMA_NO_NODE;
> @@ -304,14 +332,6 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
>  	return policy;
>  }
> 
> -/* Slow path of a mpol destructor. */
> -void __mpol_put(struct mempolicy *p)
> -{
> -	if (!atomic_dec_and_test(&p->refcnt))
> -		return;
> -	kmem_cache_free(policy_cache, p);
> -}
> -
>  static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
>  {
>  }
> @@ -1759,14 +1779,8 @@ struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
>  	} else if (vma->vm_policy) {
>  		pol = vma->vm_policy;
> 
> -		/*
> -		 * shmem_alloc_page() passes MPOL_F_SHARED policy with
> -		 * a pseudo vma whose vma->vm_ops=NULL. Take a reference
> -		 * count on these policies which will be dropped by
> -		 * mpol_cond_put() later
> -		 */
> -		if (mpol_needs_cond_ref(pol))
> -			mpol_get(pol);
> +		/* vma policy is protected by mmap_lock.
> +		 */
> +		mpol_get(pol);
>  	}
>  }
> 
> @@ -2423,7 +2437,13 @@ struct mempolicy *__mpol_dup(struct mempolicy *old)
>  		nodemask_t mems = cpuset_mems_allowed(current);
>  		mpol_rebind_policy(new, &mems);
>  	}
> -	atomic_set(&new->refcnt, 1);
> +
> +	if (percpu_ref_init(&new->refcnt, mpol_release, 0,
> +				GFP_KERNEL)) {
> +		kmem_cache_free(policy_cache, new);
> +		return ERR_PTR(-ENOMEM);
> +	}
> +
>  	return new;
>  }
> 
> @@ -2687,7 +2707,6 @@ static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
>  		kmem_cache_free(sn_cache, n);
>  		return NULL;
>  	}
> -	newpol->flags |= MPOL_F_SHARED;
>  	sp_node_init(n, start, end, newpol);
> 
>  	return n;
> @@ -2720,7 +2739,10 @@ static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
>  			goto alloc_new;
> 
>  		*mpol_new = *n->policy;
> -		atomic_set(&mpol_new->refcnt, 1);
> +		ret = percpu_ref_init(&mpol_new->refcnt,
> +				mpol_release, 0, GFP_KERNEL);
> +		if (ret)
> +			goto err_out;
>  		sp_node_init(n_new, end, n->end, mpol_new);
>  		n->end = start;
>  		sp_insert(sp, n_new);
> @@ -2756,7 +2778,7 @@ static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
>  	mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
>  	if (!mpol_new)
>  		goto err_out;
> -	atomic_set(&mpol_new->refcnt, 1);
> +
>  	goto restart;
>  }
> 
> @@ -2917,7 +2939,8 @@ void __init numa_policy_init(void)
>  		preferred_node_policy[nid] = (struct mempolicy) {
>  			.refcnt = ATOMIC_INIT(1),
>  			.mode = MPOL_PREFERRED,
> -			.flags = MPOL_F_MOF | MPOL_F_MORON,
> +			.flags = MPOL_F_MOF | MPOL_F_MORON
> +					| MPOL_F_STATIC,
>  			.nodes = nodemask_of_node(nid),
>  		};
>  	}
> -- 
> 2.25.1

-- 
Michal Hocko
SUSE Labs
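
[Editorial aside, not part of the thread: a minimal, self-contained sketch of the
percpu_ref lifecycle the cover letter describes. All names here (struct foo,
foo_*) are illustrative, not taken from the patch. The idea is: the object
starts in percpu mode, so get/put only touch a per-cpu counter; the owner calls
percpu_ref_kill() to switch to atomic mode and drop the initial reference; the
release callback runs once the count hits zero and frees the object after an
RCU grace period.]

	#include <linux/percpu-refcount.h>
	#include <linux/rcupdate.h>
	#include <linux/slab.h>

	/* Illustrative object; stands in for struct mempolicy. */
	struct foo {
		struct percpu_ref refcnt;
		struct rcu_head rcu;
	};

	/* Runs once the last reference is dropped after percpu_ref_kill(). */
	static void foo_release(struct percpu_ref *ref)
	{
		struct foo *f = container_of(ref, struct foo, refcnt);

		percpu_ref_exit(ref);	/* release the per-cpu counters */
		kfree_rcu(f, rcu);	/* defer freeing past RCU readers */
	}

	static struct foo *foo_create(void)
	{
		struct foo *f = kzalloc(sizeof(*f), GFP_KERNEL);

		if (!f)
			return NULL;
		/* starts with one reference, held by the creator */
		if (percpu_ref_init(&f->refcnt, foo_release, 0, GFP_KERNEL)) {
			kfree(f);
			return NULL;
		}
		return f;
	}

	/* Fast path: cheap per-cpu increment/decrement, no cache line bouncing. */
	static void foo_use(struct foo *f)
	{
		percpu_ref_get(&f->refcnt);
		/* ... use f ... */
		percpu_ref_put(&f->refcnt);
	}

	/* Owner initiates destruction; drops the initial reference. */
	static void foo_destroy(struct foo *f)
	{
		percpu_ref_kill(&f->refcnt);
	}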