On Mon, 5 Oct 2009, Lee Schermerhorn wrote: > Index: linux-2.6.31-mmotm-090925-1435/mm/mempolicy.c > =================================================================== > --- linux-2.6.31-mmotm-090925-1435.orig/mm/mempolicy.c 2009-09-30 12:48:45.000000000 -0400 > +++ linux-2.6.31-mmotm-090925-1435/mm/mempolicy.c 2009-09-30 12:48:46.000000000 -0400 > @@ -1564,6 +1564,53 @@ struct zonelist *huge_zonelist(struct vm > } > return zl; > } > + > +/* > + * init_nodemask_of_mempolicy > + * > + * If the current task's mempolicy is "default" [NULL], return 'false' > + * to indicate * default policy. Otherwise, extract the policy nodemask > + * for 'bind' * or 'interleave' policy into the argument nodemask, or > + * initialize the argument nodemask to contain the single node for > + * 'preferred' or * 'local' policy and return 'true' to indicate presence > + * of non-default mempolicy. > + * Looks like some mangling of the comment, there's spurious '*' throughout. > + * We don't bother with reference counting the mempolicy [mpol_get/put] > + * because the current task is examining it's own mempolicy and a task's > + * mempolicy is only ever changed by the task itself. > + * > + * N.B., it is the caller's responsibility to free a returned nodemask. > + */ > +bool init_nodemask_of_mempolicy(nodemask_t *mask) > +{ > + struct mempolicy *mempolicy; > + int nid; > + > + if (!current->mempolicy) > + return false; > + > + mempolicy = current->mempolicy; > + switch (mempolicy->mode) { > + case MPOL_PREFERRED: > + if (mempolicy->flags & MPOL_F_LOCAL) > + nid = numa_node_id(); > + else > + nid = mempolicy->v.preferred_node; > + init_nodemask_of_node(mask, nid); > + break; > + > + case MPOL_BIND: > + /* Fall through */ > + case MPOL_INTERLEAVE: > + *mask = mempolicy->v.nodes; > + break; > + > + default: > + BUG(); > + } > + > + return true; > +} > #endif > > /* Allocate a page in interleaved policy. > Index: linux-2.6.31-mmotm-090925-1435/include/linux/mempolicy.h > =================================================================== > --- linux-2.6.31-mmotm-090925-1435.orig/include/linux/mempolicy.h 2009-09-30 12:48:45.000000000 -0400 > +++ linux-2.6.31-mmotm-090925-1435/include/linux/mempolicy.h 2009-09-30 12:48:46.000000000 -0400 > @@ -201,6 +201,7 @@ extern void mpol_fix_fork_child_flag(str > extern struct zonelist *huge_zonelist(struct vm_area_struct *vma, > unsigned long addr, gfp_t gfp_flags, > struct mempolicy **mpol, nodemask_t **nodemask); > +extern bool init_nodemask_of_mempolicy(nodemask_t *mask); > extern unsigned slab_node(struct mempolicy *policy); > > extern enum zone_type policy_zone; > @@ -328,6 +329,8 @@ static inline struct zonelist *huge_zone > return node_zonelist(0, gfp_flags); > } > > +static inline bool init_nodemask_of_mempolicy(nodemask_t *m) { return false; } > + > static inline int do_migrate_pages(struct mm_struct *mm, > const nodemask_t *from_nodes, > const nodemask_t *to_nodes, int flags) > Index: linux-2.6.31-mmotm-090925-1435/mm/hugetlb.c > =================================================================== > --- linux-2.6.31-mmotm-090925-1435.orig/mm/hugetlb.c 2009-09-30 12:48:45.000000000 -0400 > +++ linux-2.6.31-mmotm-090925-1435/mm/hugetlb.c 2009-10-02 21:22:04.000000000 -0400 > @@ -1334,29 +1334,71 @@ static struct hstate *kobj_to_hstate(str > return NULL; > } > > -static ssize_t nr_hugepages_show(struct kobject *kobj, > +static ssize_t nr_hugepages_show_common(struct kobject *kobj, > struct kobj_attribute *attr, char *buf) > { > struct hstate *h = kobj_to_hstate(kobj); > return sprintf(buf, "%lu\n", h->nr_huge_pages); > } > -static ssize_t nr_hugepages_store(struct kobject *kobj, > - struct kobj_attribute *attr, const char *buf, size_t count) > +static ssize_t nr_hugepages_store_common(bool obey_mempolicy, > + struct kobject *kobj, struct kobj_attribute *attr, > + const char *buf, size_t len) > { > int err; > - unsigned long input; > + unsigned long count; > struct hstate *h = kobj_to_hstate(kobj); > + NODEMASK_ALLOC(nodemask, nodes_allowed); > In the two places you do NODEMASK_ALLOC(), here and hugetlb_sysctl_handler(), you'll need to check that nodes_allowed is non-NULL since it's possible that kmalloc() will return NULL for CONFIG_NODES_SHIFT > 8. In such a case, it's probably sufficient to simply set nodes_allowed to node_states[N_HIGH_MEMORY] so that we can still free hugepages when we're oom, a common memory freeing tactic. You could do that by simply returning false from init_nodemask_of_mempolicy() if !nodes_allowed since NODEMASK_FREE() can take a NULL pointer, but it may be easier to factor that logic into your conditional below: > - err = strict_strtoul(buf, 10, &input); > + err = strict_strtoul(buf, 10, &count); > if (err) > return 0; > > - h->max_huge_pages = set_max_huge_pages(h, input, &node_online_map); > + if (!(obey_mempolicy && init_nodemask_of_mempolicy(nodes_allowed))) { > + NODEMASK_FREE(nodes_allowed); > + nodes_allowed = &node_online_map; > + } > + h->max_huge_pages = set_max_huge_pages(h, count, nodes_allowed); > You can get away with just testing !nodes_allowed here since the stack allocation variation of NODEMASK_ALLOC() is such that nodes_allowed will always be an initialized pointer pointing to _nodes_allowed so you won't have an uninitialized warning. Once that's done, you can get rid of the check for a NULL nodes_allowed in try_to_free_low() from patch 2 since it will always be valid in set_max_huge_pages(). > - return count; > + if (nodes_allowed != &node_online_map) > + NODEMASK_FREE(nodes_allowed); > + > + return len; > +} > + > +static ssize_t nr_hugepages_show(struct kobject *kobj, > + struct kobj_attribute *attr, char *buf) > +{ > + return nr_hugepages_show_common(kobj, attr, buf); > +} > + > +static ssize_t nr_hugepages_store(struct kobject *kobj, > + struct kobj_attribute *attr, const char *buf, size_t len) > +{ > + return nr_hugepages_store_common(false, kobj, attr, buf, len); > } > HSTATE_ATTR(nr_hugepages); > > +#ifdef CONFIG_NUMA > + > +/* > + * hstate attribute for optionally mempolicy-based constraint on persistent > + * huge page alloc/free. > + */ > +static ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj, > + struct kobj_attribute *attr, char *buf) > +{ > + return nr_hugepages_show_common(kobj, attr, buf); > +} > + > +static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj, > + struct kobj_attribute *attr, const char *buf, size_t len) > +{ > + return nr_hugepages_store_common(true, kobj, attr, buf, len); > +} > +HSTATE_ATTR(nr_hugepages_mempolicy); > +#endif > + > + > static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj, > struct kobj_attribute *attr, char *buf) > { > @@ -1412,6 +1454,9 @@ static struct attribute *hstate_attrs[] > &free_hugepages_attr.attr, > &resv_hugepages_attr.attr, > &surplus_hugepages_attr.attr, > +#ifdef CONFIG_NUMA > + &nr_hugepages_mempolicy_attr.attr, > +#endif > NULL, > }; > > @@ -1578,9 +1623,9 @@ static unsigned int cpuset_mems_nr(unsig > } > > #ifdef CONFIG_SYSCTL > -int hugetlb_sysctl_handler(struct ctl_table *table, int write, > - void __user *buffer, > - size_t *length, loff_t *ppos) > +static int hugetlb_sysctl_handler_common(bool obey_mempolicy, > + struct ctl_table *table, int write, > + void __user *buffer, size_t *length, loff_t *ppos) > { > struct hstate *h = &default_hstate; > unsigned long tmp; > @@ -1592,13 +1637,39 @@ int hugetlb_sysctl_handler(struct ctl_ta > table->maxlen = sizeof(unsigned long); > proc_doulongvec_minmax(table, write, buffer, length, ppos); > > - if (write) > - h->max_huge_pages = set_max_huge_pages(h, tmp, > - &node_online_map); > + if (write) { > + NODEMASK_ALLOC(nodemask, nodes_allowed); > + if (!(obey_mempolicy && > + init_nodemask_of_mempolicy(nodes_allowed))) { > + NODEMASK_FREE(nodes_allowed); > + nodes_allowed = &node_states[N_HIGH_MEMORY]; > + } > + h->max_huge_pages = set_max_huge_pages(h, tmp, nodes_allowed); > + > + if (nodes_allowed != &node_states[N_HIGH_MEMORY]) > + NODEMASK_FREE(nodes_allowed); > + } > > return 0; > } > > +int hugetlb_sysctl_handler(struct ctl_table *table, int write, > + void __user *buffer, size_t *length, loff_t *ppos) > +{ > + > + return hugetlb_sysctl_handler_common(false, table, write, > + buffer, length, ppos); > +} > + > +#ifdef CONFIG_NUMA > +int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write, > + void __user *buffer, size_t *length, loff_t *ppos) > +{ > + return hugetlb_sysctl_handler_common(true, table, write, > + buffer, length, ppos); > +} > +#endif /* CONFIG_NUMA */ > + > int hugetlb_treat_movable_handler(struct ctl_table *table, int write, > void __user *buffer, > size_t *length, loff_t *ppos) > Index: linux-2.6.31-mmotm-090925-1435/include/linux/hugetlb.h > =================================================================== > --- linux-2.6.31-mmotm-090925-1435.orig/include/linux/hugetlb.h 2009-09-30 12:48:45.000000000 -0400 > +++ linux-2.6.31-mmotm-090925-1435/include/linux/hugetlb.h 2009-09-30 12:48:46.000000000 -0400 > @@ -23,6 +23,12 @@ void reset_vma_resv_huge_pages(struct vm > int hugetlb_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); > int hugetlb_overcommit_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); > int hugetlb_treat_movable_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); > + > +#ifdef CONFIG_NUMA > +int hugetlb_mempolicy_sysctl_handler(struct ctl_table *, int, > + void __user *, size_t *, loff_t *); > +#endif > + > int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *); > int follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *, > struct page **, struct vm_area_struct **, > Index: linux-2.6.31-mmotm-090925-1435/kernel/sysctl.c > =================================================================== > --- linux-2.6.31-mmotm-090925-1435.orig/kernel/sysctl.c 2009-09-30 12:48:45.000000000 -0400 > +++ linux-2.6.31-mmotm-090925-1435/kernel/sysctl.c 2009-09-30 12:48:46.000000000 -0400 > @@ -1164,7 +1164,7 @@ static struct ctl_table vm_table[] = { > .extra2 = &one_hundred, > }, > #ifdef CONFIG_HUGETLB_PAGE > - { > + { > .procname = "nr_hugepages", > .data = NULL, > .maxlen = sizeof(unsigned long), > @@ -1172,7 +1172,19 @@ static struct ctl_table vm_table[] = { > .proc_handler = &hugetlb_sysctl_handler, > .extra1 = (void *)&hugetlb_zero, > .extra2 = (void *)&hugetlb_infinity, > - }, > + }, > +#ifdef CONFIG_NUMA > + { > + .ctl_name = CTL_UNNUMBERED, > + .procname = "nr_hugepages_mempolicy", > + .data = NULL, > + .maxlen = sizeof(unsigned long), > + .mode = 0644, > + .proc_handler = &hugetlb_mempolicy_sysctl_handler, > + .extra1 = (void *)&hugetlb_zero, > + .extra2 = (void *)&hugetlb_infinity, > + }, > +#endif > { > .ctl_name = VM_HUGETLB_GROUP, > .procname = "hugetlb_shm_group", > There's some whitespace damage in the nr_hugepages_mempolicy hunk, it needs tabs instead of spaces for alignment. -- To unsubscribe from this list: send the line "unsubscribe linux-numa" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html