Gregory Price <gourry.memverge@xxxxxxxxx> writes: > From: Rakie Kim <rakie.kim@xxxxxx> > > This patch provides a way to set interleave weight information under > sysfs at /sys/kernel/mm/mempolicy/weighted_interleave/nodeN > > The sysfs structure is designed as follows. > > $ tree /sys/kernel/mm/mempolicy/ > /sys/kernel/mm/mempolicy/ [1] > └── weighted_interleave [2] > ├── node0 [3] > └── node1 > > Each file above can be explained as follows. > > [1] mm/mempolicy: configuration interface for mempolicy subsystem > > [2] weighted_interleave/: config interface for weighted interleave policy > > [3] weighted_interleave/nodeN: weight for nodeN > > If a node value is set to `0`, the system-default value will be used. > As of this patch, the system-default for all nodes is always 1. > > Suggested-by: Huang Ying <ying.huang@xxxxxxxxx> > Signed-off-by: Rakie Kim <rakie.kim@xxxxxx> > Signed-off-by: Honggyu Kim <honggyu.kim@xxxxxx> > Co-developed-by: Gregory Price <gregory.price@xxxxxxxxxxxx> > Signed-off-by: Gregory Price <gregory.price@xxxxxxxxxxxx> > Co-developed-by: Hyeongtak Ji <hyeongtak.ji@xxxxxx> > Signed-off-by: Hyeongtak Ji <hyeongtak.ji@xxxxxx> > --- > .../ABI/testing/sysfs-kernel-mm-mempolicy | 4 + > ...fs-kernel-mm-mempolicy-weighted-interleave | 26 ++ > mm/mempolicy.c | 231 ++++++++++++++++++ > 3 files changed, 261 insertions(+) > create mode 100644 Documentation/ABI/testing/sysfs-kernel-mm-mempolicy > create mode 100644 Documentation/ABI/testing/sysfs-kernel-mm-mempolicy-weighted-interleave > > diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy b/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy > new file mode 100644 > index 000000000000..2dcf24f4384a > --- /dev/null > +++ b/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy > @@ -0,0 +1,4 @@ > +What: /sys/kernel/mm/mempolicy/ > +Date: December 2023 > +Contact: Linux memory management mailing list <linux-mm@xxxxxxxxx> > +Description: Interface for Mempolicy > diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy-weighted-interleave b/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy-weighted-interleave > new file mode 100644 > index 000000000000..e6a38139bf0f > --- /dev/null > +++ b/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy-weighted-interleave > @@ -0,0 +1,26 @@ > +What: /sys/kernel/mm/mempolicy/weighted_interleave/ > +Date: December 2023 May be not a big deal. The date should be "January 2024"? > +Contact: Linux memory management mailing list <linux-mm@xxxxxxxxx> > +Description: Configuration Interface for the Weighted Interleave policy > + > +What: /sys/kernel/mm/mempolicy/weighted_interleave/nodeN > +Date: December 2023 > +Contact: Linux memory management mailing list <linux-mm@xxxxxxxxx> > +Description: Weight configuration interface for nodeN > + > + The interleave weight for a memory node (N). These weights are > + utilized by processes which have set their mempolicy to s/processes/tasks or memory areas/ > + MPOL_WEIGHTED_INTERLEAVE and have opted into global weights by > + omitting a task-local weight array. Now, we haven't introduced task-local weight array. So, leave this until we introduce that? > + > + These weights only affect new allocations, and changes at runtime > + will not cause migrations on already allocated pages. > + > + The minimum weight for a node is always 1. > + > + Minimum weight: 1 > + Maximum weight: 255 > + > + Writing an empty string or `0` will reset the weight to the > + system default. The system default may be set by the kernel > + or drivers at boot or during hotplug events. > diff --git a/mm/mempolicy.c b/mm/mempolicy.c > index 10a590ee1c89..ae925216798f 100644 > --- a/mm/mempolicy.c > +++ b/mm/mempolicy.c > @@ -131,6 +131,16 @@ static struct mempolicy default_policy = { > > static struct mempolicy preferred_node_policy[MAX_NUMNODES]; > > +/* > + * iw_table is the sysfs-set interleave weight table, a value of 0 denotes > + * system-default value should be used. Until system-defaults are implemented, > + * the system-default is always 1. > + * > + * iw_table is RCU protected > + */ > +static u8 __rcu *iw_table; > +static DEFINE_MUTEX(iw_table_lock); > + > /** > * numa_nearest_node - Find nearest node by state > * @node: Node id to start the search > @@ -3067,3 +3077,224 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) > p += scnprintf(p, buffer + maxlen - p, ":%*pbl", > nodemask_pr_args(&nodes)); > } > + > +#ifdef CONFIG_SYSFS > +struct iw_node_attr { > + struct kobj_attribute kobj_attr; > + int nid; > +}; > + > +static ssize_t node_show(struct kobject *kobj, struct kobj_attribute *attr, > + char *buf) > +{ > + struct iw_node_attr *node_attr; > + u8 weight; > + u8 __rcu *table; > + > + node_attr = container_of(attr, struct iw_node_attr, kobj_attr); > + > + rcu_read_lock(); > + table = rcu_dereference(iw_table); > + weight = table ? table[node_attr->nid] : 1; > + rcu_read_unlock(); > + > + return sysfs_emit(buf, "%d\n", weight); > +} > + > +static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr, > + const char *buf, size_t count) > +{ > + struct iw_node_attr *node_attr; > + u8 __rcu *new; > + u8 __rcu *old; > + u8 weight = 0; > + > + node_attr = container_of(attr, struct iw_node_attr, kobj_attr); > + if (count == 0 || sysfs_streq(buf, "")) > + weight = 0; > + else if (kstrtou8(buf, 0, &weight)) > + return -EINVAL; > + > + /* > + * The default weight is 1 (for now), when the kernel-internal > + * default weight array is implemented, this should be updated to > + * collect the system-default weight of the node if the user passes 0. > + */ > + if (!weight) > + weight = 1; >From functionality point of view, it's OK to set "weight = 1" here now. But when we add system default weight table in the future, we need to use "weight = 0". Otherwise, we cannot distinguish whether the default value have been customized via sysfs. So, I suggest to use that rule. > + > + /* We only need to allocate up to the number of possible nodes */ This comment appears not necessary. > + new = kmalloc(nr_node_ids, GFP_KERNEL); > + if (!new) > + return -ENOMEM; > + > + mutex_lock(&iw_table_lock); > + old = rcu_dereference_protected(iw_table, > + lockdep_is_held(&iw_table_lock)); > + if (old) > + memcpy(new, old, nr_node_ids); > + else > + memset(new, 1, nr_node_ids); With similar reason as above ("From functionality..."), I suggest to set "0" here. > + new[node_attr->nid] = weight; > + rcu_assign_pointer(iw_table, new); > + mutex_unlock(&iw_table_lock); > + synchronize_rcu(); > + kfree(old); > + return count; > +} > + > +static struct iw_node_attr *node_attrs[MAX_NUMNODES]; node_attrs[] can be allocated dynamically too. Just a suggestion. > + > +static void sysfs_wi_node_release(struct iw_node_attr *node_attr, > + struct kobject *parent) > +{ > + if (!node_attr) > + return; > + sysfs_remove_file(parent, &node_attr->kobj_attr.attr); > + kfree(node_attr->kobj_attr.attr.name); > + kfree(node_attr); > +} > + > +static void sysfs_wi_release(struct kobject *wi_kobj) > +{ > + int i; > + > + for (i = 0; i < MAX_NUMNODES; i++) Nitpick, nr_node_ids should be OK here. > + sysfs_wi_node_release(node_attrs[i], wi_kobj); > + kobject_put(wi_kobj); > +} > + > +static const struct kobj_type wi_ktype = { > + .sysfs_ops = &kobj_sysfs_ops, > + .release = sysfs_wi_release, > +}; > + > +static int add_weight_node(int nid, struct kobject *wi_kobj) > +{ > + struct iw_node_attr *node_attr; > + char *name; > + > + node_attr = kzalloc(sizeof(*node_attr), GFP_KERNEL); > + if (!node_attr) > + return -ENOMEM; > + > + name = kasprintf(GFP_KERNEL, "node%d", nid); > + if (!name) { > + kfree(node_attr); > + return -ENOMEM; > + } > + > + sysfs_attr_init(&node_attr->kobj_attr.attr); > + node_attr->kobj_attr.attr.name = name; > + node_attr->kobj_attr.attr.mode = 0644; > + node_attr->kobj_attr.show = node_show; > + node_attr->kobj_attr.store = node_store; > + node_attr->nid = nid; > + > + if (sysfs_create_file(wi_kobj, &node_attr->kobj_attr.attr)) { > + kfree(node_attr->kobj_attr.attr.name); > + kfree(node_attr); > + pr_err("failed to add attribute to weighted_interleave\n"); > + return -ENOMEM; > + } > + > + node_attrs[nid] = node_attr; > + return 0; > +} > + > +static int add_weighted_interleave_group(struct kobject *root_kobj) > +{ > + struct kobject *wi_kobj; > + int nid, err; > + > + wi_kobj = kzalloc(sizeof(struct kobject), GFP_KERNEL); > + if (!wi_kobj) > + return -ENOMEM; > + > + err = kobject_init_and_add(wi_kobj, &wi_ktype, root_kobj, > + "weighted_interleave"); > + if (err) { > + kfree(wi_kobj); > + return err; > + } > + > + memset(node_attrs, 0, sizeof(node_attrs)); > + for_each_node_state(nid, N_POSSIBLE) { > + err = add_weight_node(nid, wi_kobj); > + if (err) { > + pr_err("failed to add sysfs [node%d]\n", nid); > + break; > + } > + } > + if (err) > + kobject_put(wi_kobj); > + return 0; > +} > + > +static void mempolicy_kobj_release(struct kobject *kobj) > +{ > + u8 __rcu *old; > + > + mutex_lock(&iw_table_lock); > + old = rcu_dereference_protected(iw_table, > + lockdep_is_held(&iw_table_lock)); > + rcu_assign_pointer(iw_table, NULL); > + mutex_unlock(&iw_table_lock); > + synchronize_rcu(); > + /* Never free the default table, it's always in use */ Obsolete comment? > + kfree(old); It appears unnecessary to free iw_table in error path. But this isn't a big deal because error path will almost never be executed in practice. > + kfree(kobj); > +} > + > +static const struct kobj_type mempolicy_ktype = { > + .release = mempolicy_kobj_release > +}; > + > +static struct kobject *mempolicy_kobj; > +static int __init mempolicy_sysfs_init(void) > +{ > + int err; > + struct kobject *mempolicy_kobj; This overrides the global "mempolicy_kobj" defined before function. But I don't think we need the global definition. > + > + /* A NULL iw_table is interpreted by interleave logic as "all 1s" */ As I suggested above, it will be "all 0s", that is, use default weight. > + iw_table = NULL; The default value is NULL already, it appears unnecessary to do this. > + mempolicy_kobj = kzalloc(sizeof(*mempolicy_kobj), GFP_KERNEL); > + if (!mempolicy_kobj) { > + pr_err("failed to add mempolicy kobject to the system\n"); > + return -ENOMEM; > + } > + err = kobject_init_and_add(mempolicy_kobj, &mempolicy_ktype, mm_kobj, > + "mempolicy"); > + if (err) { > + kfree(mempolicy_kobj); > + return err; > + } > + > + err = add_weighted_interleave_group(mempolicy_kobj); > + > + if (err) { > + kobject_put(mempolicy_kobj); > + return err; > + } > + > + return err; > +} > + > +static void __exit mempolicy_exit(void) > +{ > + if (mempolicy_kobj) > + kobject_put(mempolicy_kobj); > +} > + > +#else > +static int __init mempolicy_sysfs_init(void) > +{ > + /* A NULL iw_table is interpreted by interleave logic as "all 1s" */ > + iw_table = NULL; > + return 0; > +} > + > +static void __exit mempolicy_exit(void) { } > +#endif /* CONFIG_SYSFS */ > +late_initcall(mempolicy_sysfs_init); > +module_exit(mempolicy_exit); mempolicy.c will not be compiled as module, so we don't need module_exit(). -- Best Regards, Huang, Ying