If MAX_NUMNODES > BITS_PER_LONG, loading/storing task->mems_allowed, or the
mems_allowed embedded in task->mempolicy, is not an atomic operation, so the
kernel page allocator can observe an empty mems_allowed while task->mems_allowed
or the mempolicy's mems_allowed is being updated.  Use a rwlock to protect them
and fix this problem.

Signed-off-by: Miao Xie <miaox@xxxxxxxxxxxxxx>
---
 include/linux/cpuset.h    |  104 +++++++++++++++++++++++++++++-
 include/linux/init_task.h |    8 +++
 include/linux/mempolicy.h |   24 ++++++--
 include/linux/sched.h     |   17 ++++-
 kernel/cpuset.c           |  113 +++++++++++++++++++++++++++------
 kernel/exit.c             |    4 +
 kernel/fork.c             |   13 ++++-
 mm/hugetlb.c              |    3 +
 mm/mempolicy.c            |  153 ++++++++++++++++++++++++++++++++++----------
 mm/slab.c                 |   27 +++++++-
 mm/slub.c                 |   10 +++
 11 files changed, 403 insertions(+), 73 deletions(-)

diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index a5740fc..b7a9ab0 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -53,8 +53,8 @@ static inline int cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask)
 	return cpuset_node_allowed_hardwall(zone_to_nid(z), gfp_mask);
 }
 
-extern int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
-					  const struct task_struct *tsk2);
+extern int cpuset_mems_allowed_intersects(struct task_struct *tsk1,
+					  struct task_struct *tsk2);
 
 #define cpuset_memory_pressure_bump() 				\
 	do {							\
@@ -90,9 +90,92 @@ extern void rebuild_sched_domains(void);
 
 extern void cpuset_print_task_mems_allowed(struct task_struct *p);
 
+# if MAX_NUMNODES > BITS_PER_LONG
+/*
+ * Used to protect task->mempolicy and mems_allowed when reading them for
+ * page allocation.
+ *
+ * We don't mind if the kernel page allocator allocates a page on a node in
+ * the old mems_allowed; that is not a big deal, since the node was
+ * previously allowed.
+ *
+ * What we must prevent is the kernel page allocator seeing an empty
+ * mems_allowed.
+ *
+ * If MAX_NUMNODES <= BITS_PER_LONG, loading/storing task->mems_allowed is
+ * atomic, so we need not do anything to protect the loading of
+ * task->mems_allowed.
+ *
+ * If MAX_NUMNODES > BITS_PER_LONG, loading/storing task->mems_allowed is
+ * not atomic, so we use a rwlock to protect the loading of
+ * task->mems_allowed.
+ */
+#define read_mem_lock_irqsave(p, flags)				\
+	do {							\
+		read_lock_irqsave(&p->mem_rwlock, flags);	\
+	} while (0)
+
+#define read_mem_unlock_irqrestore(p, flags)			\
+	do {							\
+		read_unlock_irqrestore(&p->mem_rwlock, flags);	\
+	} while (0)
+
+/* Used to protect task->mempolicy and mems_allowed when the user gets the
+ * mempolicy */
+#define read_mempolicy_lock_irqsave(p, flags)			\
+	do {							\
+		read_lock_irqsave(&p->mem_rwlock, flags);	\
+	} while (0)
+
+#define read_mempolicy_unlock_irqrestore(p, flags)		\
+	do {							\
+		read_unlock_irqrestore(&p->mem_rwlock, flags);	\
+	} while (0)
+
+#define write_mem_lock_irqsave(p, flags)			\
+	do {							\
+		write_lock_irqsave(&p->mem_rwlock, flags);	\
+	} while (0)
+
+#define write_mem_unlock_irqrestore(p, flags)			\
+	do {							\
+		write_unlock_irqrestore(&p->mem_rwlock, flags);	\
+	} while (0)
+# else
+#define read_mem_lock_irqsave(p, flags) do { (void)(flags); } while (0)
+
+#define read_mem_unlock_irqrestore(p, flags) do { (void)(flags); } while (0)
+
+/* Used to protect task->mempolicy and mems_allowed when the user reads them */
+#define read_mempolicy_lock_irqsave(p, flags)	\
+	do {					\
+		task_lock(p);			\
+		(void)(flags);			\
+	} while (0)
+
+#define read_mempolicy_unlock_irqrestore(p, flags)	\
+	do {						\
+		task_unlock(p);				\
+		(void)(flags);				\
+	} while (0)
+
+#define write_mem_lock_irqsave(p, flags)	\
+	do {					\
+		task_lock(p);			\
+		(void)(flags);			\
+	} while (0)
+
+#define write_mem_unlock_irqrestore(p, flags)	\
+	do {					\
+		task_unlock(p);			\
+		(void)(flags);			\
+	} while (0)
+# endif
+
 static inline void set_mems_allowed(nodemask_t nodemask)
 {
+	unsigned long flags;
+	write_mem_lock_irqsave(current, flags);
 	current->mems_allowed = nodemask;
+	write_mem_unlock_irqrestore(current, flags);
 }
 
 #else /* !CONFIG_CPUSETS */
@@ -144,8 +227,8 @@ static inline int cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask)
 	return 1;
 }
 
-static inline int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
-						 const struct task_struct *tsk2)
+static inline int cpuset_mems_allowed_intersects(struct task_struct *tsk1,
+						 struct task_struct *tsk2)
 {
 	return 1;
 }
@@ -193,6 +276,19 @@ static inline void set_mems_allowed(nodemask_t nodemask)
 {
 }
 
+#define read_mem_lock_irqsave(p, flags) do { (void)(flags); } while (0)
+
+#define read_mem_unlock_irqrestore(p, flags) do { (void)(flags); } while (0)
+
+#define read_mempolicy_lock_irqsave(p, flags) do { (void)(flags); } while (0)
+
+#define read_mempolicy_unlock_irqrestore(p, flags)	\
+	do { (void)(flags); } while (0)
+
+#define write_mem_lock_irqsave(p, flags) do { (void)(flags); } while (0)
+
+#define write_mem_unlock_irqrestore(p, flags) do { (void)(flags); } while (0)
+
 #endif /* !CONFIG_CPUSETS */
 
 #endif /* _LINUX_CPUSET_H */
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index abec69b..1c1e3bf 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -111,6 +111,13 @@ extern struct cred init_cred;
 # define INIT_PERF_EVENTS(tsk)
 #endif
 
+#if defined(CONFIG_CPUSETS) && MAX_NUMNODES > BITS_PER_LONG
+# define INIT_MEM_RWLOCK(tsk)						\
+	.mem_rwlock	= __RW_LOCK_UNLOCKED(tsk.mem_rwlock),
+#else
+# define INIT_MEM_RWLOCK(tsk)
+#endif
+
 /*
 *  INIT_TASK is used to set up the first task table, touch at
 * your own risk!. Base=0, limit=0x1fffff (=2MB)
@@ -180,6 +187,7 @@ extern struct cred init_cred;
 	INIT_FTRACE_GRAPH						\
 	INIT_TRACE_RECURSION						\
 	INIT_TASK_RCU_PREEMPT(tsk)					\
+	INIT_MEM_RWLOCK(tsk)						\
 }
 
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 1cc966c..aae93bc 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -51,6 +51,7 @@ enum {
 */
 #define MPOL_F_SHARED  (1 << 0)	/* identify shared policies */
 #define MPOL_F_LOCAL   (1 << 1)	/* preferred local allocation */
+#define MPOL_F_TASK    (1 << 2)	/* identify tasks' policies */
 
 #ifdef __KERNEL__
 
@@ -107,6 +108,12 @@ struct mempolicy {
 * The default fast path of a NULL MPOL_DEFAULT policy is always inlined.
 */
 
+extern struct mempolicy *__mpol_alloc(void);
+static inline struct mempolicy *mpol_alloc(void)
+{
+	return __mpol_alloc();
+}
+
 extern void __mpol_put(struct mempolicy *pol);
 static inline void mpol_put(struct mempolicy *pol)
 {
@@ -125,7 +132,7 @@ static inline int mpol_needs_cond_ref(struct mempolicy *pol)
 
 static inline void mpol_cond_put(struct mempolicy *pol)
 {
-	if (mpol_needs_cond_ref(pol))
+	if (mpol_needs_cond_ref(pol) || (pol && (pol->flags & MPOL_F_TASK)))
 		__mpol_put(pol);
 }
 
@@ -193,8 +200,8 @@ struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp,
 
 extern void numa_default_policy(void);
 extern void numa_policy_init(void);
-extern void mpol_rebind_task(struct task_struct *tsk,
-					const nodemask_t *new);
+extern int mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
+			    struct mempolicy *newpol);
 extern void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new);
 extern void mpol_fix_fork_child_flag(struct task_struct *p);
 
@@ -249,6 +256,11 @@ static inline int mpol_equal(struct mempolicy *a, struct mempolicy *b)
 	return 1;
 }
 
+static inline struct mempolicy *mpol_alloc(void)
+{
+	return NULL;
+}
+
 static inline void mpol_put(struct mempolicy *p)
 {
 }
@@ -307,9 +319,11 @@ static inline void numa_default_policy(void)
 {
 }
 
-static inline void mpol_rebind_task(struct task_struct *tsk,
-					const nodemask_t *new)
+static inline int mpol_rebind_task(struct task_struct *tsk,
+				   const nodemask_t *new,
+				   struct mempolicy *newpol)
 {
+	return 0;
 }
 
 static inline void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4b1753f..8401e7d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1403,8 +1403,9 @@ struct task_struct {
 /* Thread group tracking */
 	u32 parent_exec_id;
 	u32 self_exec_id;
-/* Protection of (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed,
- * mempolicy */
+/* Protection of (de-)allocation: mm, files, fs, tty, keyrings.
+ * If MAX_NUMNODES <= BITS_PER_LONG, it also protects mems_allowed and
+ * mempolicy; otherwise the separate rwlock mem_rwlock protects them.
+ */
 	spinlock_t alloc_lock;
 
 #ifdef CONFIG_GENERIC_HARDIRQS
@@ -1472,7 +1473,13 @@ struct task_struct {
 	cputime_t acct_timexpd;	/* stime + utime since last update */
 #endif
 #ifdef CONFIG_CPUSETS
-	nodemask_t mems_allowed;	/* Protected by alloc_lock */
+# if MAX_NUMNODES > BITS_PER_LONG
+	/* Protection of mems_allowed and mempolicy */
+	rwlock_t mem_rwlock;
+# endif
+	/* If MAX_NUMNODES <= BITS_PER_LONG, protected by alloc_lock;
+	 * otherwise protected by mem_rwlock */
+	nodemask_t mems_allowed;
 	int cpuset_mem_spread_rotor;
 #endif
 #ifdef CONFIG_CGROUPS
@@ -1495,7 +1502,9 @@ struct task_struct {
 	struct list_head perf_event_list;
 #endif
 #ifdef CONFIG_NUMA
-	struct mempolicy *mempolicy;	/* Protected by alloc_lock */
+	/* If MAX_NUMNODES <= BITS_PER_LONG, protected by alloc_lock;
+	 * otherwise protected by mem_rwlock */
+	struct mempolicy *mempolicy;
 	short il_next;
 #endif
 	atomic_t fs_excl;	/* holding fs exclusive resources */
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index c6edd06..7575e79 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -198,12 +198,13 @@ static struct cpuset top_cpuset = {
 * from one of the callbacks into the cpuset code from within
 * __alloc_pages().
 *
- * If a task is only holding callback_mutex, then it has read-only
- * access to cpusets.
+ * If a task is only holding callback_mutex or cgroup_mutex, then it has
+ * read-only access to cpusets.
 *
 * Now, the task_struct fields mems_allowed and mempolicy may be changed
- * by other task, we use alloc_lock in the task_struct fields to protect
- * them.
+ * by another task; we use alloc_lock (if MAX_NUMNODES <= BITS_PER_LONG) or
+ * mem_rwlock (if MAX_NUMNODES > BITS_PER_LONG) in the task_struct to
+ * protect them.
 *
 * The cpuset_common_file_read() handlers only hold callback_mutex across
 * small pieces of code, such as when reading out possibly multi-word
@@ -920,6 +921,10 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 * call to guarantee_online_mems(), as we know no one is changing
 * our task's cpuset.
 *
+ * As the above comment says, no one can change the current task's
+ * mems_allowed except the task itself, so we need not hold a lock to
+ * protect the task's mems_allowed during this call.
+ *
 * While the mm_struct we are migrating is typically from some
 * other task, the task_struct mems_allowed that we are hacking
 * is for our current task, which must allocate new pages for that
@@ -961,15 +966,19 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
 * we structure updates as setting all new allowed nodes, then clearing newly
 * disallowed ones.
 *
- * Called with task's alloc_lock held
+ * Called with write_mem_lock held
 */
-static void cpuset_change_task_nodemask(struct task_struct *tsk,
-					nodemask_t *newmems)
+static int cpuset_change_task_nodemask(struct task_struct *tsk,
+				       nodemask_t *newmems,
+				       struct mempolicy *newpol)
 {
+	int retval;
+
 	nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
-	mpol_rebind_task(tsk, &tsk->mems_allowed);
-	mpol_rebind_task(tsk, newmems);
+	retval = mpol_rebind_task(tsk, newmems, newpol);
 	tsk->mems_allowed = *newmems;
+
+	return retval;
 }
 
 /*
@@ -984,17 +993,31 @@ static void cpuset_change_nodemask(struct task_struct *p,
 	struct cpuset *cs;
 	int migrate;
 	const nodemask_t *oldmem = scan->data;
+	unsigned long flags;
+	struct mempolicy *newpol = NULL;
+	int retval;
 	NODEMASK_ALLOC(nodemask_t, newmems, GFP_KERNEL);
 
 	if (!newmems)
 		return;
 
+#if MAX_NUMNODES > BITS_PER_LONG
+	newpol = mpol_alloc();
+	if (newpol == NULL) {
+		NODEMASK_FREE(newmems);
+		return;
+	}
+#endif
+
 	cs = cgroup_cs(scan->cg);
 	guarantee_online_mems(cs, newmems);
 
-	task_lock(p);
-	cpuset_change_task_nodemask(p, newmems);
-	task_unlock(p);
+	write_mem_lock_irqsave(p, flags);
+	retval = cpuset_change_task_nodemask(p, newmems, newpol);
+	write_mem_unlock_irqrestore(p, flags);
+
+	if (retval)
+		mpol_put(newpol);
 
 	NODEMASK_FREE(newmems);
 
@@ -1389,6 +1412,8 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
 static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to,
 			       struct cpuset *cs)
 {
+	struct mempolicy *newpol = NULL;
+	unsigned long flags;
 	int err;
 	/*
 	 * can_attach beforehand should guarantee that this doesn't fail.
@@ -1397,9 +1422,19 @@ static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to,
 	err = set_cpus_allowed_ptr(tsk, cpus_attach);
 	WARN_ON_ONCE(err);
 
-	task_lock(tsk);
-	cpuset_change_task_nodemask(tsk, to);
-	task_unlock(tsk);
+#if MAX_NUMNODES > BITS_PER_LONG
+	newpol = mpol_alloc();
+	if (newpol == NULL)
+		return;
+#endif
+
+	write_mem_lock_irqsave(tsk, flags);
+	err = cpuset_change_task_nodemask(tsk, to, newpol);
+	write_mem_unlock_irqrestore(tsk, flags);
+
+	if (err)
+		mpol_put(newpol);
+
 	cpuset_update_task_spread_flag(cs, tsk);
 }
 
@@ -2242,7 +2277,14 @@ nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
 */
 int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
 {
-	return nodes_intersects(*nodemask, current->mems_allowed);
+	unsigned long flags;
+	int retval;
+
+	read_mem_lock_irqsave(current, flags);
+	retval = nodes_intersects(*nodemask, current->mems_allowed);
+	read_mem_unlock_irqrestore(current, flags);
+
+	return retval;
 }
 
 /*
@@ -2323,11 +2365,17 @@ int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
 {
 	const struct cpuset *cs;	/* current cpuset ancestors */
 	int allowed;			/* is allocation in zone z allowed? */
+	unsigned long flags;
 
 	if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
 		return 1;
 	might_sleep_if(!(gfp_mask & __GFP_HARDWALL));
-	if (node_isset(node, current->mems_allowed))
+
+	read_mem_lock_irqsave(current, flags);
+	allowed = node_isset(node, current->mems_allowed);
+	read_mem_unlock_irqrestore(current, flags);
+
+	if (allowed)
 		return 1;
 	/*
 	 * Allow tasks that have access to memory reserves because they have
@@ -2378,9 +2426,17 @@ int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
 */
 int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)
 {
+	int allowed;
+	unsigned long flags;
+
 	if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
 		return 1;
-	if (node_isset(node, current->mems_allowed))
+
+	read_mem_lock_irqsave(current, flags);
+	allowed = node_isset(node, current->mems_allowed);
+	read_mem_unlock_irqrestore(current, flags);
+
+	if (allowed)
 		return 1;
 	/*
 	 * Allow tasks that have access to memory reserves because they have
@@ -2447,11 +2503,14 @@ void cpuset_unlock(void)
 int cpuset_mem_spread_node(void)
 {
 	int node;
+	unsigned long flags;
 
+	read_mem_lock_irqsave(current, flags);
 	node = next_node(current->cpuset_mem_spread_rotor, current->mems_allowed);
 	if (node == MAX_NUMNODES)
 		node = first_node(current->mems_allowed);
 	current->cpuset_mem_spread_rotor = node;
+	read_mem_unlock_irqrestore(current, flags);
 	return node;
 }
 EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);
@@ -2467,10 +2526,19 @@ EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);
 * to the other.
 **/
 
-int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
-				   const struct task_struct *tsk2)
+int cpuset_mems_allowed_intersects(struct task_struct *tsk1,
+				   struct task_struct *tsk2)
 {
-	return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
+	unsigned long flags1, flags2;
+	int retval;
+
+	read_mem_lock_irqsave(tsk1, flags1);
+	read_mem_lock_irqsave(tsk2, flags2);
+	retval = nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
+	read_mem_unlock_irqrestore(tsk2, flags2);
+	read_mem_unlock_irqrestore(tsk1, flags1);
+
+	return retval;
 }
 
 /**
@@ -2483,14 +2551,17 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
 */
 void cpuset_print_task_mems_allowed(struct task_struct *tsk)
 {
+	unsigned long flags;
 	struct dentry *dentry;
 
 	dentry = task_cs(tsk)->css.cgroup->dentry;
 	spin_lock(&cpuset_buffer_lock);
 	snprintf(cpuset_name, CPUSET_NAME_LEN,
 		 dentry ? (const char *)dentry->d_name.name : "/");
+	read_mem_lock_irqsave(tsk, flags);
 	nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN,
 			   tsk->mems_allowed);
+	read_mem_unlock_irqrestore(tsk, flags);
 	printk(KERN_INFO "%s cpuset=%s mems_allowed=%s\n",
 	       tsk->comm, cpuset_name, cpuset_nodelist);
 	spin_unlock(&cpuset_buffer_lock);
diff --git a/kernel/exit.c b/kernel/exit.c
index 45ed043..28162dd 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -16,6 +16,7 @@
 #include <linux/key.h>
 #include <linux/security.h>
 #include <linux/cpu.h>
+#include <linux/cpuset.h>
 #include <linux/acct.h>
 #include <linux/tsacct_kern.h>
 #include <linux/file.h>
@@ -901,6 +902,7 @@ NORET_TYPE void do_exit(long code)
 {
 	struct task_struct *tsk = current;
 	int group_dead;
+	unsigned long flags;
 
 	profile_task_exit(tsk);
 
@@ -1001,8 +1003,10 @@ NORET_TYPE void do_exit(long code)
 	exit_notify(tsk, group_dead);
 #ifdef CONFIG_NUMA
+	write_mem_lock_irqsave(tsk, flags);
 	mpol_put(tsk->mempolicy);
 	tsk->mempolicy = NULL;
+	write_mem_unlock_irqrestore(tsk, flags);
 #endif
 #ifdef CONFIG_FUTEX
 	if (unlikely(current->pi_state_cache))
diff --git a/kernel/fork.c b/kernel/fork.c
index 17bbf09..7ed253d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -32,6 +32,7 @@
 #include <linux/capability.h>
 #include <linux/cpu.h>
 #include <linux/cgroup.h>
+#include <linux/cpuset.h>
 #include <linux/security.h>
 #include <linux/hugetlb.h>
 #include <linux/swap.h>
@@ -986,6 +987,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	int retval;
 	struct task_struct *p;
 	int cgroup_callbacks_done = 0;
+	struct mempolicy *pol;
+	unsigned long flags;
 
 	if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
 		return ERR_PTR(-EINVAL);
@@ -1091,8 +1094,16 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	p->io_context = NULL;
 	p->audit_context = NULL;
 	cgroup_fork(p);
+#if defined(CONFIG_CPUSETS) && MAX_NUMNODES > BITS_PER_LONG
+	rwlock_init(&p->mem_rwlock);
+#endif
 #ifdef CONFIG_NUMA
-	p->mempolicy = mpol_dup(p->mempolicy);
+	read_mem_lock_irqsave(current, flags);
+	pol = current->mempolicy;
+	mpol_get(pol);
+	read_mem_unlock_irqrestore(current, flags);
+	p->mempolicy = mpol_dup(pol);
+	mpol_put(pol);
 	if (IS_ERR(p->mempolicy)) {
 		retval = PTR_ERR(p->mempolicy);
 		p->mempolicy = NULL;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 3a5aeb3..523cf46 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1836,9 +1836,12 @@ static unsigned int cpuset_mems_nr(unsigned int *array)
 {
 	int node;
 	unsigned int nr = 0;
+	unsigned long flags;
 
+	read_mem_lock_irqsave(current, flags);
 	for_each_node_mask(node, cpuset_current_mems_allowed)
 		nr += array[node];
+	read_mem_unlock_irqrestore(current, flags);
 
 	return nr;
 }
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 290fb5b..324dfc3 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -190,8 +190,9 @@ static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
 * parameter with respect to the policy mode and flags.  But, we need to
 * handle an empty nodemask with MPOL_PREFERRED here.
 *
- * Must be called holding task's alloc_lock to protect task's mems_allowed
- * and mempolicy.  May also be called holding the mmap_semaphore for write.
+ * Must be called with write_mem_lock_irqsave()/write_mem_unlock_irqrestore()
+ * held to protect the task's mems_allowed and mempolicy.  May also be called
+ * holding the mmap_semaphore for write.
 */
 static int mpol_set_nodemask(struct mempolicy *pol, const nodemask_t *nodes,
 			     struct nodemask_scratch *nsc)
@@ -270,6 +271,16 @@
 	return policy;
 }
 
+struct mempolicy *__mpol_alloc(void)
+{
+	struct mempolicy *pol;
+
+	pol = kmem_cache_alloc(policy_cache, GFP_KERNEL);
+	if (pol)
+		atomic_set(&pol->refcnt, 1);
+	return pol;
+}
+
 /* Slow path of a mpol destructor. */
 void __mpol_put(struct mempolicy *p)
 {
@@ -347,12 +358,30 @@ static void mpol_rebind_policy(struct mempolicy *pol,
 * Wrapper for mpol_rebind_policy() that just requires task
 * pointer, and updates task mempolicy.
 *
- * Called with task's alloc_lock held.
+ * If tsk->mempolicy is NULL, return -1 to tell the caller that it is
+ * unnecessary to rebind the task's mempolicy.
+ *
+ * Use write_mem_lock_irqsave()/write_mem_unlock_irqrestore() to protect it.
 */
-
-void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
+int mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
+		     struct mempolicy *newpol)
 {
+#if MAX_NUMNODES > BITS_PER_LONG
+	struct mempolicy *pol = tsk->mempolicy;
+
+	if (!pol)
+		return -1;
+
+	*newpol = *pol;
+	atomic_set(&newpol->refcnt, 1);
+
+	mpol_rebind_policy(newpol, new);
+	tsk->mempolicy = newpol;
+	mpol_put(pol);
+#else
 	mpol_rebind_policy(tsk->mempolicy, new);
+#endif
+	return 0;
 }
 
 /*
@@ -621,12 +650,13 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
 	struct mempolicy *new, *old;
 	struct mm_struct *mm = current->mm;
 	NODEMASK_SCRATCH(scratch);
+	unsigned long irqflags;
 	int ret;
 
 	if (!scratch)
 		return -ENOMEM;
 
-	new = mpol_new(mode, flags, nodes);
+	new = mpol_new(mode, flags | MPOL_F_TASK, nodes);
 	if (IS_ERR(new)) {
 		ret = PTR_ERR(new);
 		goto out;
@@ -639,10 +669,10 @@
 	 */
 	if (mm)
 		down_write(&mm->mmap_sem);
-	task_lock(current);
+	write_mem_lock_irqsave(current, irqflags);
 	ret = mpol_set_nodemask(new, nodes, scratch);
 	if (ret) {
-		task_unlock(current);
+		write_mem_unlock_irqrestore(current, irqflags);
 		if (mm)
 			up_write(&mm->mmap_sem);
 		mpol_put(new);
@@ -654,7 +684,7 @@
 	if (new && new->mode == MPOL_INTERLEAVE &&
 	    nodes_weight(new->v.nodes))
 		current->il_next = first_node(new->v.nodes);
-	task_unlock(current);
+	write_mem_unlock_irqrestore(current, irqflags);
 	if (mm)
 		up_write(&mm->mmap_sem);
 
@@ -668,7 +698,9 @@ out:
 /*
 * Return nodemask for policy for get_mempolicy() query
 *
- * Called with task's alloc_lock held
+ * Must be called with read_mempolicy_lock_irqsave()/
+ * read_mempolicy_unlock_irqrestore() held
+ * to protect it.
 */
 static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
 {
@@ -712,7 +744,8 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
 	int err;
 	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma = NULL;
-	struct mempolicy *pol = current->mempolicy;
+	struct mempolicy *pol = NULL;
+	unsigned long irqflags;
 
 	if (flags &
		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
@@ -722,9 +755,10 @@
 		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
 			return -EINVAL;
 		*policy = 0;	/* just so it's initialized */
-		task_lock(current);
+
+		read_mempolicy_lock_irqsave(current, irqflags);
 		*nmask  = cpuset_current_mems_allowed;
-		task_unlock(current);
+		read_mempolicy_unlock_irqrestore(current, irqflags);
 		return 0;
 	}
 
@@ -747,6 +781,13 @@
 	} else if (addr)
 		return -EINVAL;
 
+	if (!pol) {
+		read_mempolicy_lock_irqsave(current, irqflags);
+		pol = current->mempolicy;
+		mpol_get(pol);
+		read_mempolicy_unlock_irqrestore(current, irqflags);
+	}
+
 	if (!pol)
 		pol = &default_policy;	/* indicates default behavior */
 
@@ -756,9 +797,11 @@
 			if (err < 0)
 				goto out;
 			*policy = err;
-		} else if (pol == current->mempolicy &&
+		} else if (pol->flags & MPOL_F_TASK &&
				pol->mode == MPOL_INTERLEAVE) {
+			read_mempolicy_lock_irqsave(current, irqflags);
 			*policy = current->il_next;
+			read_mempolicy_unlock_irqrestore(current, irqflags);
 		} else {
 			err = -EINVAL;
 			goto out;
 		}
@@ -780,9 +823,17 @@
 	err = 0;
 	if (nmask) {
-		task_lock(current);
+		/* Maybe task->mempolicy was updated by cpuset, so we must get
+		 * a new one. */
+		mpol_cond_put(pol);
+		read_mempolicy_lock_irqsave(current, irqflags);
+		pol = current->mempolicy;
+		if (pol)
+			mpol_get(pol);
+		else
+			pol = &default_policy;
 		get_policy_nodemask(pol, nmask);
-		task_unlock(current);
+		read_mempolicy_unlock_irqrestore(current, irqflags);
 	}
 
 out:
@@ -981,6 +1032,7 @@ static long do_mbind(unsigned long start, unsigned long len,
 	struct mempolicy *new;
 	unsigned long end;
 	int err;
+	unsigned long irqflags;
 	LIST_HEAD(pagelist);
 
 	if (flags & ~(unsigned long)(MPOL_MF_STRICT |
@@ -1028,9 +1080,9 @@ static long do_mbind(unsigned long start, unsigned long len,
 		NODEMASK_SCRATCH(scratch);
 		if (scratch) {
 			down_write(&mm->mmap_sem);
-			task_lock(current);
+			write_mem_lock_irqsave(current, irqflags);
 			err = mpol_set_nodemask(new, nmask, scratch);
-			task_unlock(current);
+			write_mem_unlock_irqrestore(current, irqflags);
 			if (err)
 				up_write(&mm->mmap_sem);
 		} else
@@ -1370,7 +1422,8 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
 static struct mempolicy *get_vma_policy(struct task_struct *task,
 		struct vm_area_struct *vma, unsigned long addr)
 {
-	struct mempolicy *pol = task->mempolicy;
+	struct mempolicy *pol = NULL;
+	unsigned long irqflags;
 
 	if (vma) {
 		if (vma->vm_ops && vma->vm_ops->get_policy) {
@@ -1381,8 +1434,16 @@ static struct mempolicy *get_vma_policy(struct task_struct *task,
 		} else if (vma->vm_policy)
 			pol = vma->vm_policy;
 	}
+	if (!pol) {
+		read_mem_lock_irqsave(task, irqflags);
+		pol = task->mempolicy;
+		mpol_get(pol);
+		read_mem_unlock_irqrestore(task, irqflags);
+	}
+
 	if (!pol)
 		pol = &default_policy;
+
 	return pol;
 }
 
@@ -1584,11 +1645,15 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask)
 {
 	struct mempolicy *mempolicy;
 	int nid;
+	unsigned long irqflags;
 
 	if (!(mask && current->mempolicy))
 		return false;
 
+	read_mempolicy_lock_irqsave(current, irqflags);
 	mempolicy = current->mempolicy;
+	mpol_get(mempolicy);
+
 	switch (mempolicy->mode) {
 	case MPOL_PREFERRED:
 		if (mempolicy->flags & MPOL_F_LOCAL)
@@ -1608,6 +1673,9 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask)
 		BUG();
 	}
 
+	read_mempolicy_unlock_irqrestore(current, irqflags);
+	mpol_cond_put(mempolicy);
+
 	return true;
 }
 #endif
@@ -1654,6 +1722,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
 {
 	struct mempolicy *pol = get_vma_policy(current, vma, addr);
 	struct zonelist *zl;
+	struct page *page;
 
 	if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
 		unsigned nid;
@@ -1667,15 +1736,17 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
 		/*
 		 * slow path: ref counted shared policy
 		 */
-		struct page *page = __alloc_pages_nodemask(gfp, 0,
-						zl, policy_nodemask(gfp, pol));
+		page = __alloc_pages_nodemask(gfp, 0, zl,
+					      policy_nodemask(gfp, pol));
 		__mpol_put(pol);
 		return page;
 	}
 	/*
 	 * fast path:  default or task policy
 	 */
-	return __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol));
+	page = __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol));
+	mpol_cond_put(pol);
+	return page;
 }
 
 /**
@@ -1692,26 +1763,36 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
 *	Allocate a page from the kernel page pool.  When not in
 *	interrupt context and apply the current process NUMA policy.
 *	Returns NULL when no page can be allocated.
- *
- *	Don't call cpuset_update_task_memory_state() unless
- *	1) it's ok to take cpuset_sem (can WAIT), and
- *	2) allocating for current task (not interrupt).
 */
 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
 {
-	struct mempolicy *pol = current->mempolicy;
+	struct mempolicy *pol;
+	struct page *page;
+	unsigned long irqflags;
+
+	read_mem_lock_irqsave(current, irqflags);
+	pol = current->mempolicy;
+	mpol_get(pol);
+	read_mem_unlock_irqrestore(current, irqflags);
 
-	if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
+	if (!pol || in_interrupt() || (gfp & __GFP_THISNODE)) {
+		mpol_put(pol);
 		pol = &default_policy;
+	}
 
 	/*
 	 * No reference counting needed for current->mempolicy
 	 * nor system default_policy
 	 */
 	if (pol->mode == MPOL_INTERLEAVE)
-		return alloc_page_interleave(gfp, order, interleave_nodes(pol));
-	return __alloc_pages_nodemask(gfp, order,
-			policy_zonelist(gfp, pol), policy_nodemask(gfp, pol));
+		page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
+	else
+		page = __alloc_pages_nodemask(gfp, order,
+					      policy_zonelist(gfp, pol),
+					      policy_nodemask(gfp, pol));
+
+	mpol_cond_put(pol);
+	return page;
 }
 EXPORT_SYMBOL(alloc_pages_current);
 
@@ -1961,6 +2042,7 @@ restart:
 */
 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
 {
+	unsigned long irqflags;
 	int ret;
 
 	sp->root = RB_ROOT;		/* empty tree == default mempolicy */
@@ -1981,9 +2063,9 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
 			return;	/* no valid nodemask intersection */
 		}
 
-		task_lock(current);
+		write_mem_lock_irqsave(current, irqflags);
 		ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
-		task_unlock(current);
+		write_mem_unlock_irqrestore(current, irqflags);
 		mpol_put(mpol);	/* drop our ref on sb mpol */
 		if (ret) {
 			NODEMASK_SCRATCH_FREE(scratch);
@@ -2134,6 +2216,7 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
 	char *nodelist = strchr(str, ':');
 	char *flags = strchr(str, '=');
 	int i;
+	unsigned long irqflags;
 	int err = 1;
 
 	if (nodelist) {
@@ -2215,9 +2298,9 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
 		int ret;
 		NODEMASK_SCRATCH(scratch);
 		if (scratch) {
-			task_lock(current);
+			write_mem_lock_irqsave(current, irqflags);
 			ret = mpol_set_nodemask(new, &nodes, scratch);
-			task_unlock(current);
+			write_mem_unlock_irqrestore(current, irqflags);
 		} else
 			ret = -ENOMEM;
 		NODEMASK_SCRATCH_FREE(scratch);
diff --git a/mm/slab.c b/mm/slab.c
index 7451bda..2df5185 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3145,14 +3145,25 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
 static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
 {
 	int nid_alloc, nid_here;
+	struct mempolicy *pol;
+	unsigned long lflags;
 
 	if (in_interrupt() || (flags & __GFP_THISNODE))
 		return NULL;
+
+	read_mem_lock_irqsave(current, lflags);
+	pol = current->mempolicy;
+	mpol_get(pol);
+	read_mem_unlock_irqrestore(current, lflags);
+
 	nid_alloc = nid_here = numa_node_id();
 	if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
 		nid_alloc = cpuset_mem_spread_node();
-	else if (current->mempolicy)
-		nid_alloc = slab_node(current->mempolicy);
+	else if (pol)
+		nid_alloc = slab_node(pol);
+
+	mpol_put(pol);
+
 	if (nid_alloc != nid_here)
 		return ____cache_alloc_node(cachep, flags, nid_alloc);
 	return NULL;
@@ -3175,11 +3186,21 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
 	enum zone_type high_zoneidx = gfp_zone(flags);
 	void *obj = NULL;
 	int nid;
+	struct mempolicy *pol;
+	unsigned long lflags;
 
 	if (flags & __GFP_THISNODE)
 		return NULL;
 
-	zonelist = node_zonelist(slab_node(current->mempolicy), flags);
+	read_mem_lock_irqsave(current, lflags);
+	pol = current->mempolicy;
+	mpol_get(pol);
+	read_mem_unlock_irqrestore(current, lflags);
+
+	zonelist = node_zonelist(slab_node(pol), flags);
+
+	mpol_put(pol);
+
 	local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
 
 retry:
diff --git a/mm/slub.c b/mm/slub.c
index 8d71aaf..cb533d4 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1357,6 +1357,8 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
 	struct zone *zone;
 	enum zone_type high_zoneidx = gfp_zone(flags);
 	struct page *page;
+	struct mempolicy *pol;
+	unsigned long lflags;
 
 	/*
 	 * The defrag ratio allows a configuration of the tradeoffs between
@@ -1380,7 +1382,15 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
 			get_cycles() % 1024 > s->remote_node_defrag_ratio)
 		return NULL;
 
+	read_mem_lock_irqsave(current, lflags);
+	pol = current->mempolicy;
+	mpol_get(pol);
+	read_mem_unlock_irqrestore(current, lflags);
+
 	zonelist = node_zonelist(slab_node(current->mempolicy), flags);
+
+	mpol_put(pol);
+
 	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
 		struct kmem_cache_node *n;
-- 
1.6.5.2
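
For illustration, here is a minimal sketch of the read-side pattern the new
helpers are meant to enforce, assuming CONFIG_CPUSETS with
MAX_NUMNODES > BITS_PER_LONG.  The function snapshot_mems_allowed() is
hypothetical and not part of this patch; the locking simply mirrors what the
patch does in cpuset_nodemask_valid_mems_allowed() and
__cpuset_node_allowed_softwall():

#include <linux/cpuset.h>
#include <linux/nodemask.h>
#include <linux/sched.h>

/*
 * Hypothetical helper, not part of this patch: take a consistent snapshot of
 * current->mems_allowed.  With MAX_NUMNODES > BITS_PER_LONG the nodemask
 * spans several words, so the copy is done under read_mem_lock_irqsave() to
 * avoid observing a half-updated (possibly empty) mask while another task
 * rebinds this task's cpuset under write_mem_lock_irqsave().
 */
static void snapshot_mems_allowed(nodemask_t *snapshot)
{
	unsigned long flags;

	read_mem_lock_irqsave(current, flags);
	*snapshot = current->mems_allowed;
	read_mem_unlock_irqrestore(current, flags);
}

When MAX_NUMNODES <= BITS_PER_LONG, or CONFIG_CPUSETS is disabled, the patch
defines read_mem_lock_irqsave()/read_mem_unlock_irqrestore() as no-ops, so the
same code compiles down to a plain load on small configurations.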