PATCH 11/15 Mempolicy: use MPOL_F_LOCAL to indicate preferred local policy

Against: 2.6.25-rc8-mm1

Now that we're using "preferred local" policy for system default, we need to
make this as fast as possible. Because of the variable size of the mempolicy
structure [based on size of nodemasks], the preferred_node may be in a
different cacheline from the mode. This can result in accessing an extra
cacheline in the normal case of system default policy. I suspect this is the
cause of an observed 2-3% slowdown in page fault testing relative to a kernel
without this patch series.

To alleviate this, use an internal mode flag, MPOL_F_LOCAL, in the mempolicy
flags member, which is guaranteed [?] to be in the same cacheline as the mode
itself.

Verified that the reworked mempolicy now performs slightly better on
2.6.25-rc8-mm1 for both anon and shmem segments with system default and vma
[preferred local] policy.

Signed-off-by: Lee Schermerhorn <lee.schermerhorn@xxxxxx>

 Documentation/vm/numa_memory_policy.txt | 11 +++----
 include/linux/mempolicy.h | 1
 mm/mempolicy.c | 45 ++++++++++++++------------------
 3 files changed, 27 insertions(+), 30 deletions(-)

Index: linux-2.6.25-rc8-mm1/include/linux/mempolicy.h
===================================================================
--- linux-2.6.25-rc8-mm1.orig/include/linux/mempolicy.h 2008-04-02 17:47:26.000000000 -0400
+++ linux-2.6.25-rc8-mm1/include/linux/mempolicy.h 2008-04-02 17:48:32.000000000 -0400
@@ -50,6 +50,7 @@ enum {
  * are never OR'ed into the mode in mempolicy API arguments.
*/ #define MPOL_F_SHARED (1 << 0) /* identify shared policies */ +#define MPOL_F_LOCAL (1 << 1) /* preferred local allocation */ #ifdef __KERNEL__ Index: linux-2.6.25-rc8-mm1/mm/mempolicy.c =================================================================== --- linux-2.6.25-rc8-mm1.orig/mm/mempolicy.c 2008-04-02 17:47:41.000000000 -0400 +++ linux-2.6.25-rc8-mm1/mm/mempolicy.c 2008-04-02 17:51:58.000000000 -0400 @@ -110,7 +110,7 @@ enum zone_type policy_zone = 0; struct mempolicy default_policy = { .refcnt = ATOMIC_INIT(1), /* never free it */ .mode = MPOL_PREFERRED, - .v = { .preferred_node = -1 }, + .flags = MPOL_F_LOCAL, }; static const struct mempolicy_operations { @@ -163,7 +163,7 @@ static int mpol_new_interleave(struct me static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes) { if (!nodes) - pol->v.preferred_node = -1; /* local allocation */ + pol->flags |= MPOL_F_LOCAL; /* local allocation */ else if (nodes_empty(*nodes)) return -EINVAL; /* no allowed nodes */ else @@ -290,14 +290,15 @@ static void mpol_rebind_preferred(struct if (pol->flags & MPOL_F_STATIC_NODES) { int node = first_node(pol->w.user_nodemask); - if (node_isset(node, *nodes)) + if (node_isset(node, *nodes)) { pol->v.preferred_node = node; - else - pol->v.preferred_node = -1; + pol->flags &= ~MPOL_F_LOCAL; + } else + pol->flags |= MPOL_F_LOCAL; } else if (pol->flags & MPOL_F_RELATIVE_NODES) { mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes); pol->v.preferred_node = first_node(tmp); - } else if (pol->v.preferred_node != -1) { + } else if (!(pol->flags & MPOL_F_LOCAL)) { pol->v.preferred_node = node_remap(pol->v.preferred_node, pol->w.cpuset_mems_allowed, *nodes); @@ -645,7 +646,7 @@ static void get_policy_nodemask(struct m *nodes = p->v.nodes; break; case MPOL_PREFERRED: - if (p->v.preferred_node >= 0) + if (!(p->flags & MPOL_F_LOCAL)) node_set(p->v.preferred_node, *nodes); /* else return empty node mask for local allocation */ break; @@ -1324,13 +1325,12 @@ 
static nodemask_t *policy_nodemask(gfp_t /* Return a zonelist indicated by gfp for node representing a mempolicy */ static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy) { - int nd; + int nd = numa_node_id(); switch (policy->mode) { case MPOL_PREFERRED: - nd = policy->v.preferred_node; - if (nd < 0) - nd = numa_node_id(); + if (!(policy->flags & MPOL_F_LOCAL)) + nd = policy->v.preferred_node; break; case MPOL_BIND: /* @@ -1339,16 +1339,13 @@ static struct zonelist *policy_zonelist( * current node is part of the mask, we use the zonelist for * the first node in the mask instead. */ - nd = numa_node_id(); if (unlikely(gfp & __GFP_THISNODE) && unlikely(!node_isset(nd, policy->v.nodes))) nd = first_node(policy->v.nodes); break; case MPOL_INTERLEAVE: /* should not happen */ - nd = numa_node_id(); break; default: - nd = 0; BUG(); } return node_zonelist(nd, gfp); @@ -1379,14 +1376,15 @@ static unsigned interleave_nodes(struct */ unsigned slab_node(struct mempolicy *policy) { - if (!policy) + if (!policy || policy->flags & MPOL_F_LOCAL) return numa_node_id(); switch (policy->mode) { case MPOL_PREFERRED: - if (unlikely(policy->v.preferred_node >= 0)) - return policy->v.preferred_node; - return numa_node_id(); + /* + * handled MPOL_F_LOCAL above + */ + return policy->v.preferred_node; case MPOL_INTERLEAVE: return interleave_nodes(policy); @@ -1667,7 +1665,8 @@ int __mpol_equal(struct mempolicy *a, st case MPOL_INTERLEAVE: return nodes_equal(a->v.nodes, b->v.nodes); case MPOL_PREFERRED: - return a->v.preferred_node == b->v.preferred_node; + return a->v.preferred_node == b->v.preferred_node && + a->flags == b->flags; default: BUG(); return 0; @@ -1947,7 +1946,7 @@ void numa_default_policy(void) } /* - * "local" is pseudo-policy: MPOL_PREFERRED with preferred_node == -1 + * "local" is pseudo-policy: MPOL_PREFERRED with MPOL_F_LOCAL flag * Used only for mpol_to_str() */ #define MPOL_LOCAL (MPOL_INTERLEAVE + 1) @@ -1963,7 +1962,6 @@ static inline int 
mpol_to_str(char *buff { char *p = buffer; int l; - int nid; nodemask_t nodes; unsigned short mode; unsigned short flags = pol ? pol->flags : 0; @@ -1980,11 +1978,10 @@ static inline int mpol_to_str(char *buff case MPOL_PREFERRED: nodes_clear(nodes); - nid = pol->v.preferred_node; - if (nid < 0) + if (flags & MPOL_F_LOCAL) mode = MPOL_LOCAL; /* pseudo-policy */ else - node_set(nid, nodes); + node_set(pol->v.preferred_node, nodes); break; case MPOL_BIND: Index: linux-2.6.25-rc8-mm1/Documentation/vm/numa_memory_policy.txt =================================================================== --- linux-2.6.25-rc8-mm1.orig/Documentation/vm/numa_memory_policy.txt 2008-04-02 17:47:37.000000000 -0400 +++ linux-2.6.25-rc8-mm1/Documentation/vm/numa_memory_policy.txt 2008-04-02 17:47:48.000000000 -0400 @@ -176,12 +176,11 @@ Components of Memory Policies containing the cpu where the allocation takes place. Internally, the Preferred policy uses a single node--the - preferred_node member of struct mempolicy. A "distinguished - value of this preferred_node, currently '-1', is interpreted - as "the node containing the cpu where the allocation takes - place"--local allocation. "Local" allocation policy can be - viewed as a Preferred policy that starts at the node containing - the cpu where the allocation takes place. + preferred_node member of struct mempolicy. When the internal + mode flag MPOL_F_LOCAL is set, the preferred_node is ignored and + the policy is interpreted as local allocation. "Local" allocation + policy can be viewed as a Preferred policy that starts at the node + containing the cpu where the allocation takes place. It is possible for the user to specify that local allocation is always preferred by passing an empty nodemask with this mode. -- To unsubscribe from this list: send the line "unsubscribe linux-numa" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html