On Mon, 8 Mar 2010, Miao Xie wrote:

> Changes from V1 to V2:
> - Update task->mems_allowed lazily, instead of using a lock to protect it
>
> Before applying this patch, cpuset updates task->mems_allowed by setting
> all new bits in the nodemask first, and clearing all old unallowed bits
> later.  But in this way, the allocator is likely to see an empty nodemask.
>

Likely?  It's probably rarer than one in a million.

> The problem is the following: when MAX_NUMNODES > BITS_PER_LONG, the size
> of nodemask_t is greater than the size of a long integer, so loads and
> stores of nodemask_t are not atomic operations.  Suppose task->mems_allowed
> doesn't intersect with new_mask; for example, the first word of the mask
> is empty and only the first word of new_mask is not empty.  If the
> allocator loads a word of the mask before
>
>         current->mems_allowed |= new_mask;
>
> and then loads another word of the mask after
>
>         current->mems_allowed = new_mask;
>
> the allocator gets an empty nodemask.
>
> Considering that changes to task->mems_allowed are not frequent, this
> patch uses two variables as a tag to indicate whether task->mems_allowed
> needs to be updated or not.  And before setting the tag, cpuset caches
> the new mask of every task in its task_struct.

So what exactly is the benefit of 58568d2 from last June that caused this
issue to begin with?  It seems like this entire patchset is a revert of
that commit.  So why shouldn't we just revert that one commit and then add
the locking and updating necessary for configs where
MAX_NUMNODES > BITS_PER_LONG on top?

> diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
> index a5740fc..2eb0fa7 100644
> --- a/include/linux/cpuset.h
> +++ b/include/linux/cpuset.h
> @@ -93,6 +93,44 @@ extern void cpuset_print_task_mems_allowed(struct task_struct *p);
>  static inline void set_mems_allowed(nodemask_t nodemask)
>  {
>         current->mems_allowed = nodemask;
> +       current->mems_allowed_for_update = nodemask;
> +}
> +
> +#define task_mems_lock_irqsave(p, flags)                        \
> +       do {                                                    \
> +               spin_lock_irqsave(&p->mems_lock, flags);        \
> +       } while (0)
> +
> +#define task_mems_unlock_irqrestore(p, flags)                   \
> +       do {                                                    \
> +               spin_unlock_irqrestore(&p->mems_lock, flags);   \
> +       } while (0)

We don't need mems_lock at all for 99% of the machines running Linux
where the update can be done atomically, so these need to be redefined to
be a no-op for those users.

> +
> +#include <linux/mempolicy.h>

#includes should be at the top of include/linux/cpuset.h.

> +/**
> + * cpuset_update_task_mems_allowed - update task memory placement
> + *
> + * If the current task's mems_allowed_for_update and mempolicy_for_update
> + * are changed by cpuset behind our backs, update current->mems_allowed,
> + * mems_generation and the task's NUMA mempolicy to the new values.
> + *
> + * Call WITHOUT mems_lock held.
> + *
> + * This routine is needed to update the per-task mems_allowed and
> + * mempolicy within the task's context, when it is trying to allocate
> + * memory.
> + */
> +static __always_inline void cpuset_update_task_mems_allowed(void)
> +{
> +       struct task_struct *tsk = current;
> +       unsigned long flags;
> +
> +       if (unlikely(tsk->mems_generation != tsk->mems_generation_for_update)) {
> +               task_mems_lock_irqsave(tsk, flags);
> +               tsk->mems_allowed = tsk->mems_allowed_for_update;
> +               tsk->mems_generation = tsk->mems_generation_for_update;
> +               task_mems_unlock_irqrestore(tsk, flags);

By this synchronization, you're guaranteeing that no other kernel code
ever reads tsk->mems_allowed when tsk != current?  Otherwise, you're
simply protecting the store to tsk->mems_allowed here and not serializing
on the loads that can return empty nodemasks.
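For reference, the torn read is easy to demonstrate with a standalone
userspace approximation (my sketch, not kernel code): a two-word mask is
always updated with the same set-new-bits-then-clear-old-bits protocol,
so from the writer's point of view at least one word is set at every
instant, yet a reader loading the words one at a time can still observe
both as zero.  volatile stands in for the kernel's plain loads and
stores here; it is not rigorous C11, but it mirrors the 2010-era code.

        /* torn.c: build with something like cc -O2 -pthread torn.c */
        #include <pthread.h>
        #include <stdio.h>
        #include <stdlib.h>

        /* two "nodemask" words; the single allowed node starts in word 1 */
        static volatile unsigned long mask[2] = { 0x0UL, 0x1UL };
        static volatile int done;

        static void *reader(void *arg)
        {
                while (!done) {
                        unsigned long w0 = mask[0];     /* first load ... */
                        unsigned long w1 = mask[1];     /* ... second load,
                                                           not atomic together */
                        if (w0 == 0 && w1 == 0) {
                                puts("reader observed an empty nodemask");
                                exit(1);
                        }
                }
                return NULL;
        }

        int main(void)
        {
                pthread_t t;
                long i;

                pthread_create(&t, NULL, reader, NULL);
                for (i = 0; i < 100000000L; i++) {
                        /* rebind from word 1 to word 0: set new, clear old */
                        mask[0] |= 0x1UL;
                        mask[1] = 0x0UL;
                        /* and back again, same two-step protocol */
                        mask[1] |= 0x1UL;
                        mask[0] = 0x0UL;
                }
                done = 1;
                pthread_join(t, NULL);
                puts("no torn read seen this run; the window is only a "
                     "couple of instructions wide");
                return 0;
        }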
> +               mpol_rebind_task(tsk, &tsk->mems_allowed);
> +       }
> }
>
> #else /* !CONFIG_CPUSETS */
> @@ -193,6 +231,13 @@ static inline void set_mems_allowed(nodemask_t nodemask)
>  {
>  }
>
> +static inline void cpuset_update_task_mems_allowed(void)
> +{
> +}
> +
> +#define task_mems_lock_irqsave(p, flags) do { (void)(flags); } while (0)
> +
> +#define task_mems_unlock_irqrestore(p, flags) do { (void)(flags); } while (0)
>  #endif /* !CONFIG_CPUSETS */
>
>  #endif /* _LINUX_CPUSET_H */
> diff --git a/include/linux/init_task.h b/include/linux/init_task.h
> index abec69b..be016f0 100644
> --- a/include/linux/init_task.h
> +++ b/include/linux/init_task.h
> @@ -103,7 +103,7 @@ extern struct group_info init_groups;
>  extern struct cred init_cred;
>
>  #ifdef CONFIG_PERF_EVENTS
> -# define INIT_PERF_EVENTS(tsk) \
> +# define INIT_PERF_EVENTS(tsk)                                  \

Probably not intended.

>         .perf_event_mutex =                                     \
>                 __MUTEX_INITIALIZER(tsk.perf_event_mutex),      \
>         .perf_event_list = LIST_HEAD_INIT(tsk.perf_event_list),
> @@ -111,6 +111,22 @@ extern struct cred init_cred;
>  # define INIT_PERF_EVENTS(tsk)
>  #endif
>
> +#ifdef CONFIG_CPUSETS
> +# define INIT_MEMS_ALLOWED(tsk)                                 \
> +       .mems_lock = __SPIN_LOCK_UNLOCKED(tsk.mems_lock),       \
> +       .mems_generation = 0,                                   \
> +       .mems_generation_for_update = 0,
> +#else
> +# define INIT_MEMS_ALLOWED(tsk)
> +#endif
> +
> +#ifdef CONFIG_NUMA
> +# define INIT_MEMPOLICY                                         \
> +       .mempolicy = NULL,

This has never been needed before.

> +#else
> +# define INIT_MEMPOLICY
> +#endif
> +
>  /*
>   *  INIT_TASK is used to set up the first task table, touch at
>   * your own risk!. Base=0, limit=0x1fffff (=2MB)
> @@ -180,6 +196,8 @@ extern struct cred init_cred;
>         INIT_FTRACE_GRAPH                                       \
>         INIT_TRACE_RECURSION                                    \
>         INIT_TASK_RCU_PREEMPT(tsk)                              \
> +       INIT_MEMS_ALLOWED(tsk)                                  \
> +       INIT_MEMPOLICY                                          \
>  }
>
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 46c6f8d..9e7f14f 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1351,8 +1351,9 @@ struct task_struct {
>  /* Thread group tracking */
>         u32 parent_exec_id;
>         u32 self_exec_id;
> -/* Protection of (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed,
> - * mempolicy */
> +/*
> + * Protection of (de-)allocation: mm, files, fs, tty, keyrings
> + */
>         spinlock_t alloc_lock;
>
>  #ifdef CONFIG_GENERIC_HARDIRQS
> @@ -1420,8 +1421,36 @@ struct task_struct {
>         cputime_t acct_timexpd; /* stime + utime since last update */
>  #endif
>  #ifdef CONFIG_CPUSETS
> -       nodemask_t mems_allowed;        /* Protected by alloc_lock */
> +       /*
> +        * It is unnecessary to protect mems_allowed, because it can only
> +        * be loaded and stored by the current task itself.
> +        */
> +       nodemask_t mems_allowed;
>         int cpuset_mem_spread_rotor;
> +
> +       /* Protection of ->mems_allowed_for_update */
> +       spinlock_t mems_lock;
> +       /*
> +        * This variable (mems_allowed_for_update) is just used for
> +        * caching memory placement information.
> +        *
> +        * ->mems_allowed is used by the kernel allocator.
> +        */
> +       nodemask_t mems_allowed_for_update;     /* Protected by mems_lock */

Another nodemask_t in struct task_struct for this?  And for all configs,
including those that can do atomic updates to mems_allowed?
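Concretely, something like this (a sketch using the patch's names; the
config test is my suggestion, not existing code) would keep the lock out
of the common case:

        #if MAX_NUMNODES > BITS_PER_LONG
        /* multi-word nodemask: stores can be torn, take the lock */
        # define task_mems_lock_irqsave(p, flags)                       \
                spin_lock_irqsave(&(p)->mems_lock, flags)
        # define task_mems_unlock_irqrestore(p, flags)                  \
                spin_unlock_irqrestore(&(p)->mems_lock, flags)
        #else
        /* nodemask fits in one long: a plain store is already atomic */
        # define task_mems_lock_irqsave(p, flags)                       \
                do { (void)(flags); } while (0)
        # define task_mems_unlock_irqrestore(p, flags)                  \
                do { (void)(flags); } while (0)
        #endif

The mems_lock field itself could be made conditional on the same test so
that the 99% don't pay for it in struct task_struct either.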
> +
> +       /*
> +        * Increment this integer every time ->mems_allowed_for_update is
> +        * changed by cpuset.  The task can compare this number with
> +        * mems_generation, and if they are not the same,
> +        * mems_allowed_for_update has changed and ->mems_allowed must be
> +        * updated.  In this way, tasks avoid having to take the lock and
> +        * reload mems_allowed_for_update unless it has changed.
> +        */
> +       int mems_generation_for_update;
> +       /*
> +        * After updating mems_allowed, set mems_generation to
> +        * mems_generation_for_update.
> +        */
> +       int mems_generation;

I don't see why you need two mems_generation numbers; one should belong
in the task's cpuset.  Then you can compare tsk->mems_generation to
task_cs(tsk)->mems_generation at cpuset_update_task_memory_state() if you
set tsk->mems_generation = task_cs(tsk)->mems_generation on
cpuset_attach() or update_nodemask().

>  #endif
>  #ifdef CONFIG_CGROUPS
>         /* Control Group info protected by css_set_lock */
> @@ -1443,7 +1472,11 @@ struct task_struct {
>         struct list_head perf_event_list;
>  #endif
>  #ifdef CONFIG_NUMA
> -       struct mempolicy *mempolicy;    /* Protected by alloc_lock */
> +       /*
> +        * It is unnecessary to protect mempolicy, because it can only be
> +        * loaded/stored by the current task itself.
> +        */
> +       struct mempolicy *mempolicy;

That's going to change soon since my oom killer rewrite protects
tsk->mempolicy under task_lock().
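Going back to the mems_generation question above, here's roughly what I
mean (my illustration of that suggestion, not code from the patch;
locking around task_cs() and cs->mems_allowed is elided and would follow
the pre-58568d2 code):

        /* one generation counter, owned by the cpuset itself */
        void cpuset_update_task_memory_state(void)
        {
                struct task_struct *tsk = current;
                struct cpuset *cs = task_cs(tsk);

                if (unlikely(tsk->mems_generation != cs->mems_generation)) {
                        tsk->mems_allowed = cs->mems_allowed;
                        tsk->mems_generation = cs->mems_generation;
                        mpol_rebind_task(tsk, &tsk->mems_allowed);
                }
        }

update_nodemask() would bump cs->mems_generation after changing
cs->mems_allowed, and cpuset_attach() would set tsk->mems_generation =
task_cs(tsk)->mems_generation so freshly attached tasks don't rebind
spuriously.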