Gregory Price <gourry.memverge@xxxxxxxxx> writes: > From: Rakie Kim <rakie.kim@xxxxxx> > > This patch provides a way to set interleave weight information under > sysfs at /sys/kernel/mm/mempolicy/weighted_interleave/nodeN > > The sysfs structure is designed as follows. > > $ tree /sys/kernel/mm/mempolicy/ > /sys/kernel/mm/mempolicy/ [1] > └── weighted_interleave [2] > ├── node0 [3] > └── node1 > > Each file above can be explained as follows. > > [1] mm/mempolicy: configuration interface for mempolicy subsystem > > [2] weighted_interleave/: config interface for weighted interleave policy > > [3] weighted_interleave/nodeN: weight for nodeN > > If sysfs is disabled in the config, the global interleave weights > will default to "1" for all nodes. > > Signed-off-by: Rakie Kim <rakie.kim@xxxxxx> > Signed-off-by: Honggyu Kim <honggyu.kim@xxxxxx> > Co-developed-by: Gregory Price <gregory.price@xxxxxxxxxxxx> > Signed-off-by: Gregory Price <gregory.price@xxxxxxxxxxxx> > Co-developed-by: Hyeongtak Ji <hyeongtak.ji@xxxxxx> > Signed-off-by: Hyeongtak Ji <hyeongtak.ji@xxxxxx> > --- > .../ABI/testing/sysfs-kernel-mm-mempolicy | 4 + > ...fs-kernel-mm-mempolicy-weighted-interleave | 22 +++ > mm/mempolicy.c | 156 ++++++++++++++++++ > 3 files changed, 182 insertions(+) > create mode 100644 Documentation/ABI/testing/sysfs-kernel-mm-mempolicy > create mode 100644 Documentation/ABI/testing/sysfs-kernel-mm-mempolicy-weighted-interleave > > diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy b/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy > new file mode 100644 > index 000000000000..2dcf24f4384a > --- /dev/null > +++ b/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy > @@ -0,0 +1,4 @@ > +What: /sys/kernel/mm/mempolicy/ > +Date: December 2023 > +Contact: Linux memory management mailing list <linux-mm@xxxxxxxxx> > +Description: Interface for Mempolicy > diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy-weighted-interleave 
b/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy-weighted-interleave > new file mode 100644 > index 000000000000..aa27fdf08c19 > --- /dev/null > +++ b/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy-weighted-interleave > @@ -0,0 +1,22 @@ > +What: /sys/kernel/mm/mempolicy/weighted_interleave/ > +Date: December 2023 > +Contact: Linux memory management mailing list <linux-mm@xxxxxxxxx> > +Description: Configuration Interface for the Weighted Interleave policy > + > +What: /sys/kernel/mm/mempolicy/weighted_interleave/nodeN > +Date: December 2023 > +Contact: Linux memory management mailing list <linux-mm@xxxxxxxxx> > +Description: Weight configuration interface for nodeN > + > + The interleave weight for a memory node (N). These weights are > + utilized by processes which have set their mempolicy to > + MPOL_WEIGHTED_INTERLEAVE and have opted into global weights by > + omitting a task-local weight array. > + > + These weights only affect new allocations, and changes at runtime > + will not cause migrations on already allocated pages. > + > + Writing an empty string resets the weight value to 1. I still think that it would be a good idea to provide a better default weight value derived from HMAT or CDAT, if available. So, would it be better not to make "1" part of the ABI? > + > + Minimum weight: 1 Can the weight be "0"? Do we need a way to specify that a node should not participate in weighted interleave? > + Maximum weight: 255 > diff --git a/mm/mempolicy.c b/mm/mempolicy.c > index 10a590ee1c89..0e77633b07a5 100644 > --- a/mm/mempolicy.c > +++ b/mm/mempolicy.c > @@ -131,6 +131,8 @@ static struct mempolicy default_policy = { > > static struct mempolicy preferred_node_policy[MAX_NUMNODES]; > > +static char iw_table[MAX_NUMNODES]; > + It is not obvious whether "char" is "signed" or "unsigned" here. Given that the maximum weight above is 255, wouldn't it be better to use "u8"? Also, we may need a way to tell whether the weight has been overridden by the user. A special value (such as 255) could be used for that.
If so, the maximum weight should be 254 instead of 255. Since this is a user space interface, would it be better to use 100 as the maximum value? > /** > * numa_nearest_node - Find nearest node by state > * @node: Node id to start the search > @@ -3067,3 +3069,157 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) > p += scnprintf(p, buffer + maxlen - p, ":%*pbl", > nodemask_pr_args(&nodes)); > } > + > +#ifdef CONFIG_SYSFS > +struct iw_node_attr { > + struct kobj_attribute kobj_attr; > + int nid; > +}; > + > +static ssize_t node_show(struct kobject *kobj, struct kobj_attribute *attr, > + char *buf) > +{ > + struct iw_node_attr *node_attr; > + > + node_attr = container_of(attr, struct iw_node_attr, kobj_attr); > + return sysfs_emit(buf, "%d\n", iw_table[node_attr->nid]); > +} > + > +static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr, > + const char *buf, size_t count) > +{ > + struct iw_node_attr *node_attr; > + unsigned char weight = 0; > + > + node_attr = container_of(attr, struct iw_node_attr, kobj_attr); > + /* If no input, set default weight to 1 */ > + if (count == 0 || sysfs_streq(buf, "")) > + weight = 1; > + else if (kstrtou8(buf, 0, &weight) || !weight) > + return -EINVAL; > + > + iw_table[node_attr->nid] = weight; kstrtou8(), "unsigned char weight", and "char iw_table[]" are not completely consistent with each other. Wouldn't it be better to make them consistent?
> + return count; > +} > + > +static struct iw_node_attr *node_attrs[MAX_NUMNODES]; > + > +static void sysfs_wi_node_release(struct iw_node_attr *node_attr, > + struct kobject *parent) > +{ > + if (!node_attr) > + return; > + sysfs_remove_file(parent, &node_attr->kobj_attr.attr); > + kfree(node_attr->kobj_attr.attr.name); > + kfree(node_attr); > +} > + > +static void sysfs_mempolicy_release(struct kobject *mempolicy_kobj) > +{ > + int i; > + > + for (i = 0; i < MAX_NUMNODES; i++) > + sysfs_wi_node_release(node_attrs[i], mempolicy_kobj); IIUC, if this is called in an error path (such as in add_weighted_interleave_group()), some node_attrs[] elements may be "NULL"? > + kobject_put(mempolicy_kobj); > +} > + > +static const struct kobj_type mempolicy_ktype = { > + .sysfs_ops = &kobj_sysfs_ops, > + .release = sysfs_mempolicy_release, > +}; > + > +static int add_weight_node(int nid, struct kobject *wi_kobj) > +{ > + struct iw_node_attr *node_attr; > + char *name; > + > + node_attr = kzalloc(sizeof(*node_attr), GFP_KERNEL); > + if (!node_attr) > + return -ENOMEM; > + > + name = kasprintf(GFP_KERNEL, "node%d", nid); > + if (!name) { > + kfree(node_attr); > + return -ENOMEM; > + } > + > + sysfs_attr_init(&node_attr->kobj_attr.attr); > + node_attr->kobj_attr.attr.name = name; > + node_attr->kobj_attr.attr.mode = 0644; > + node_attr->kobj_attr.show = node_show; > + node_attr->kobj_attr.store = node_store; > + node_attr->nid = nid; > + > + if (sysfs_create_file(wi_kobj, &node_attr->kobj_attr.attr)) { > + kfree(node_attr->kobj_attr.attr.name); > + kfree(node_attr); > + pr_err("failed to add attribute to weighted_interleave\n"); > + return -ENOMEM; > + } > + > + node_attrs[nid] = node_attr; > + return 0; > +} > + > +static int add_weighted_interleave_group(struct kobject *root_kobj) > +{ > + struct kobject *wi_kobj; > + int nid, err; > + > + wi_kobj = kzalloc(sizeof(struct kobject), GFP_KERNEL); > + if (!wi_kobj) > + return -ENOMEM; > + > + err = kobject_init_and_add(wi_kobj,
&mempolicy_ktype, root_kobj, > + "weighted_interleave"); > + if (err) { > + kfree(wi_kobj); > + return err; > + } > + > + memset(node_attrs, 0, sizeof(node_attrs)); > + for_each_node_state(nid, N_POSSIBLE) { > + err = add_weight_node(nid, wi_kobj); > + if (err) { > + pr_err("failed to add sysfs [node%d]\n", nid); > + break; > + } > + } > + if (err) > + kobject_put(wi_kobj); > + return 0; > +} > + > +static int __init mempolicy_sysfs_init(void) > +{ > + int err; > + struct kobject *root_kobj; > + > + memset(&iw_table, 1, sizeof(iw_table)); > + > + root_kobj = kobject_create_and_add("mempolicy", mm_kobj); > + if (!root_kobj) { > + pr_err("failed to add mempolicy kobject to the system\n"); > + return -ENOMEM; > + } > + > + err = add_weighted_interleave_group(root_kobj); > + > + if (err) > + kobject_put(root_kobj); > + return err; > + > +} > +#else > +static int __init mempolicy_sysfs_init(void) > +{ > + /* > + * if sysfs is not enabled MPOL_WEIGHTED_INTERLEAVE defaults to > + * MPOL_INTERLEAVE behavior, but is still defined separately to > + * allow task-local weighted interleave to operate as intended. > + */ > + memset(&iw_table, 1, sizeof(iw_table)); > + return 0; > +} > +#endif /* CONFIG_SYSFS */ > +late_initcall(mempolicy_sysfs_init); -- Best Regards, Huang, Ying