From: Rakie Kim <rakie.kim@xxxxxx> This patch provides a way to set interleave weight information under sysfs at /sys/kernel/mm/mempolicy/weighted_interleave/nodeN The sysfs structure is designed as follows. $ tree /sys/kernel/mm/mempolicy/ /sys/kernel/mm/mempolicy/ [1] └── weighted_interleave [2] ├── node0 [3] └── node1 Each file above can be explained as follows. [1] mm/mempolicy: configuration interface for mempolicy subsystem [2] weighted_interleave/: config interface for weighted interleave policy [3] weighted_interleave/nodeN: weight for nodeN If a node value is set to `0`, the system-default value will be used. As of this patch, the system-default for all nodes is always 1. Suggested-by: Huang Ying <ying.huang@xxxxxxxxx> Signed-off-by: Rakie Kim <rakie.kim@xxxxxx> Signed-off-by: Honggyu Kim <honggyu.kim@xxxxxx> Co-developed-by: Gregory Price <gregory.price@xxxxxxxxxxxx> Signed-off-by: Gregory Price <gregory.price@xxxxxxxxxxxx> Co-developed-by: Hyeongtak Ji <hyeongtak.ji@xxxxxx> Signed-off-by: Hyeongtak Ji <hyeongtak.ji@xxxxxx> --- .../ABI/testing/sysfs-kernel-mm-mempolicy | 4 + ...fs-kernel-mm-mempolicy-weighted-interleave | 26 ++ mm/mempolicy.c | 231 ++++++++++++++++++ 3 files changed, 261 insertions(+) create mode 100644 Documentation/ABI/testing/sysfs-kernel-mm-mempolicy create mode 100644 Documentation/ABI/testing/sysfs-kernel-mm-mempolicy-weighted-interleave diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy b/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy new file mode 100644 index 000000000000..2dcf24f4384a --- /dev/null +++ b/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy @@ -0,0 +1,4 @@ +What: /sys/kernel/mm/mempolicy/ +Date: December 2023 +Contact: Linux memory management mailing list <linux-mm@xxxxxxxxx> +Description: Interface for Mempolicy diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy-weighted-interleave b/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy-weighted-interleave new file mode 100644 index 000000000000..e6a38139bf0f --- /dev/null +++ b/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy-weighted-interleave @@ -0,0 +1,26 @@ +What: /sys/kernel/mm/mempolicy/weighted_interleave/ +Date: December 2023 +Contact: Linux memory management mailing list <linux-mm@xxxxxxxxx> +Description: Configuration Interface for the Weighted Interleave policy + +What: /sys/kernel/mm/mempolicy/weighted_interleave/nodeN +Date: December 2023 +Contact: Linux memory management mailing list <linux-mm@xxxxxxxxx> +Description: Weight configuration interface for nodeN + + The interleave weight for a memory node (N). These weights are + utilized by processes which have set their mempolicy to + MPOL_WEIGHTED_INTERLEAVE and have opted into global weights by + omitting a task-local weight array. + + These weights only affect new allocations, and changes at runtime + will not cause migrations on already allocated pages. + + The minimum weight for a node is always 1. + + Minimum weight: 1 + Maximum weight: 255 + + Writing an empty string or `0` will reset the weight to the + system default. The system default may be set by the kernel + or drivers at boot or during hotplug events. diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 10a590ee1c89..ae925216798f 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -131,6 +131,16 @@ static struct mempolicy default_policy = { static struct mempolicy preferred_node_policy[MAX_NUMNODES]; +/* + * iw_table is the sysfs-set interleave weight table, a value of 0 denotes + * system-default value should be used. Until system-defaults are implemented, + * the system-default is always 1. + * + * iw_table is RCU protected + */ +static u8 __rcu *iw_table; +static DEFINE_MUTEX(iw_table_lock); + /** * numa_nearest_node - Find nearest node by state * @node: Node id to start the search @@ -3067,3 +3077,224 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) p += scnprintf(p, buffer + maxlen - p, ":%*pbl", nodemask_pr_args(&nodes)); } + +#ifdef CONFIG_SYSFS +struct iw_node_attr { + struct kobj_attribute kobj_attr; + int nid; +}; + +static ssize_t node_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct iw_node_attr *node_attr; + u8 weight; + u8 __rcu *table; + + node_attr = container_of(attr, struct iw_node_attr, kobj_attr); + + rcu_read_lock(); + table = rcu_dereference(iw_table); + weight = table ? table[node_attr->nid] : 1; + rcu_read_unlock(); + + return sysfs_emit(buf, "%d\n", weight); +} + +static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct iw_node_attr *node_attr; + u8 __rcu *new; + u8 __rcu *old; + u8 weight = 0; + + node_attr = container_of(attr, struct iw_node_attr, kobj_attr); + if (count == 0 || sysfs_streq(buf, "")) + weight = 0; + else if (kstrtou8(buf, 0, &weight)) + return -EINVAL; + + /* + * The default weight is 1 (for now), when the kernel-internal + * default weight array is implemented, this should be updated to + * collect the system-default weight of the node if the user passes 0. + */ + if (!weight) + weight = 1; + + /* We only need to allocate up to the number of possible nodes */ + new = kmalloc(nr_node_ids, GFP_KERNEL); + if (!new) + return -ENOMEM; + + mutex_lock(&iw_table_lock); + old = rcu_dereference_protected(iw_table, + lockdep_is_held(&iw_table_lock)); + if (old) + memcpy(new, old, nr_node_ids); + else + memset(new, 1, nr_node_ids); + new[node_attr->nid] = weight; + rcu_assign_pointer(iw_table, new); + mutex_unlock(&iw_table_lock); + synchronize_rcu(); + kfree(old); + return count; +} + +static struct iw_node_attr *node_attrs[MAX_NUMNODES]; + +static void sysfs_wi_node_release(struct iw_node_attr *node_attr, + struct kobject *parent) +{ + if (!node_attr) + return; + sysfs_remove_file(parent, &node_attr->kobj_attr.attr); + kfree(node_attr->kobj_attr.attr.name); + kfree(node_attr); +} + +static void sysfs_wi_release(struct kobject *wi_kobj) +{ + int i; + + for (i = 0; i < MAX_NUMNODES; i++) + sysfs_wi_node_release(node_attrs[i], wi_kobj); + kobject_put(wi_kobj); +} + +static const struct kobj_type wi_ktype = { + .sysfs_ops = &kobj_sysfs_ops, + .release = sysfs_wi_release, +}; + +static int add_weight_node(int nid, struct kobject *wi_kobj) +{ + struct iw_node_attr *node_attr; + char *name; + + node_attr = kzalloc(sizeof(*node_attr), GFP_KERNEL); + if (!node_attr) + return -ENOMEM; + + name = kasprintf(GFP_KERNEL, "node%d", nid); + if (!name) { + kfree(node_attr); + return -ENOMEM; + } + + sysfs_attr_init(&node_attr->kobj_attr.attr); + node_attr->kobj_attr.attr.name = name; + node_attr->kobj_attr.attr.mode = 0644; + node_attr->kobj_attr.show = node_show; + node_attr->kobj_attr.store = node_store; + node_attr->nid = nid; + + if (sysfs_create_file(wi_kobj, &node_attr->kobj_attr.attr)) { + kfree(node_attr->kobj_attr.attr.name); + kfree(node_attr); + pr_err("failed to add attribute to weighted_interleave\n"); + return -ENOMEM; + } + + node_attrs[nid] = node_attr; + return 0; +} + +static int add_weighted_interleave_group(struct kobject *root_kobj) +{ + struct kobject *wi_kobj; + int nid, err; + + wi_kobj = kzalloc(sizeof(struct kobject), GFP_KERNEL); + if (!wi_kobj) + return -ENOMEM; + + err = kobject_init_and_add(wi_kobj, &wi_ktype, root_kobj, + "weighted_interleave"); + if (err) { + kfree(wi_kobj); + return err; + } + + memset(node_attrs, 0, sizeof(node_attrs)); + for_each_node_state(nid, N_POSSIBLE) { + err = add_weight_node(nid, wi_kobj); + if (err) { + pr_err("failed to add sysfs [node%d]\n", nid); + break; + } + } + if (err) + kobject_put(wi_kobj); + return 0; +} + +static void mempolicy_kobj_release(struct kobject *kobj) +{ + u8 __rcu *old; + + mutex_lock(&iw_table_lock); + old = rcu_dereference_protected(iw_table, + lockdep_is_held(&iw_table_lock)); + rcu_assign_pointer(iw_table, NULL); + mutex_unlock(&iw_table_lock); + synchronize_rcu(); + /* Never free the default table, it's always in use */ + kfree(old); + kfree(kobj); +} + +static const struct kobj_type mempolicy_ktype = { + .release = mempolicy_kobj_release +}; + +static struct kobject *mempolicy_kobj; +static int __init mempolicy_sysfs_init(void) +{ + int err; + struct kobject *mempolicy_kobj; + + /* A NULL iw_table is interpreted by interleave logic as "all 1s" */ + iw_table = NULL; + mempolicy_kobj = kzalloc(sizeof(*mempolicy_kobj), GFP_KERNEL); + if (!mempolicy_kobj) { + pr_err("failed to add mempolicy kobject to the system\n"); + return -ENOMEM; + } + err = kobject_init_and_add(mempolicy_kobj, &mempolicy_ktype, mm_kobj, + "mempolicy"); + if (err) { + kfree(mempolicy_kobj); + return err; + } + + err = add_weighted_interleave_group(mempolicy_kobj); + + if (err) { + kobject_put(mempolicy_kobj); + return err; + } + + return err; +} + +static void __exit mempolicy_exit(void) +{ + if (mempolicy_kobj) + kobject_put(mempolicy_kobj); +} + +#else +static int __init mempolicy_sysfs_init(void) +{ + /* A NULL iw_table is interpreted by interleave logic as "all 1s" */ + iw_table = NULL; + return 0; +} + +static void __exit mempolicy_exit(void) { } +#endif /* CONFIG_SYSFS */ +late_initcall(mempolicy_sysfs_init); +module_exit(mempolicy_exit); -- 2.39.1