On 7/25/22 2:05 PM, Huang, Ying wrote: > Aneesh Kumar K V <aneesh.kumar@xxxxxxxxxxxxx> writes: > >> On 7/25/22 12:07 PM, Huang, Ying wrote: >>> "Aneesh Kumar K.V" <aneesh.kumar@xxxxxxxxxxxxx> writes: >>> >>>> By default, all nodes are assigned to the default memory tier which >>>> is the memory tier designated for nodes with DRAM >>>> >>>> Set dax kmem device node's tier to slower memory tier by assigning >>>> performance level to MEMTIER_PERF_LEVEL_PMEM. PMEM tier >>>> appears below the default memory tier in demotion order. >>>> >>>> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@xxxxxxxxxxxxx> >>>> --- >>>> arch/powerpc/platforms/pseries/papr_scm.c | 41 ++++++++++++++++++++--- >>>> drivers/acpi/nfit/core.c | 41 ++++++++++++++++++++++- >>>> 2 files changed, 76 insertions(+), 6 deletions(-) >>>> >>>> diff --git a/arch/powerpc/platforms/pseries/papr_scm.c b/arch/powerpc/platforms/pseries/papr_scm.c >>>> index 82cae08976bc..3b6164418d6f 100644 >>>> --- a/arch/powerpc/platforms/pseries/papr_scm.c >>>> +++ b/arch/powerpc/platforms/pseries/papr_scm.c >>>> @@ -14,6 +14,8 @@ >>>> #include <linux/delay.h> >>>> #include <linux/seq_buf.h> >>>> #include <linux/nd.h> >>>> +#include <linux/memory.h> >>>> +#include <linux/memory-tiers.h> >>>> >>>> #include <asm/plpar_wrappers.h> >>>> #include <asm/papr_pdsm.h> >>>> @@ -98,6 +100,7 @@ struct papr_scm_priv { >>>> bool hcall_flush_required; >>>> >>>> uint64_t bound_addr; >>>> + int target_node; >>>> >>>> struct nvdimm_bus_descriptor bus_desc; >>>> struct nvdimm_bus *bus; >>>> @@ -1278,6 +1281,7 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p) >>>> p->bus_desc.module = THIS_MODULE; >>>> p->bus_desc.of_node = p->pdev->dev.of_node; >>>> p->bus_desc.provider_name = kstrdup(p->pdev->name, GFP_KERNEL); >>>> + p->target_node = dev_to_node(&p->pdev->dev); >>>> >>>> /* Set the dimm command family mask to accept PDSMs */ >>>> set_bit(NVDIMM_FAMILY_PAPR, &p->bus_desc.dimm_family_mask); >>>> @@ -1322,7 +1326,7 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p) >>>> mapping.size = p->blocks * p->block_size; // XXX: potential overflow? >>>> >>>> memset(&ndr_desc, 0, sizeof(ndr_desc)); >>>> - target_nid = dev_to_node(&p->pdev->dev); >>>> + target_nid = p->target_node; >>>> online_nid = numa_map_to_online_node(target_nid); >>>> ndr_desc.numa_node = online_nid; >>>> ndr_desc.target_node = target_nid; >>>> @@ -1582,15 +1586,42 @@ static struct platform_driver papr_scm_driver = { >>>> }, >>>> }; >>>> >>>> +static int papr_scm_callback(struct notifier_block *self, >>>> + unsigned long action, void *arg) >>>> +{ >>>> + struct memory_notify *mnb = arg; >>>> + int nid = mnb->status_change_nid; >>>> + struct papr_scm_priv *p; >>>> + >>>> + if (nid == NUMA_NO_NODE || action != MEM_ONLINE) >>>> + return NOTIFY_OK; >>>> + >>>> + mutex_lock(&papr_ndr_lock); >>>> + list_for_each_entry(p, &papr_nd_regions, region_list) { >>>> + if (p->target_node == nid) { >>>> + node_devices[nid]->perf_level = MEMTIER_PERF_LEVEL_PMEM; >>>> + break; >>>> + } >>>> + } >>>> + >>>> + mutex_unlock(&papr_ndr_lock); >>>> + return NOTIFY_OK; >>>> +} >>>> + >>>> static int __init papr_scm_init(void) >>>> { >>>> int ret; >>>> >>>> ret = platform_driver_register(&papr_scm_driver); >>>> - if (!ret) >>>> - mce_register_notifier(&mce_ue_nb); >>>> - >>>> - return ret; >>>> + if (ret) >>>> + return ret; >>>> + mce_register_notifier(&mce_ue_nb); >>>> + /* >>>> + * register a memory hotplug notifier at prio 2 so that we >>>> + * can update the perf level for the node. >>>> + */ >>>> + hotplug_memory_notifier(papr_scm_callback, MEMTIER_HOTPLUG_PRIO + 1); >>>> + return 0; >>>> } >>>> module_init(papr_scm_init); >>>> >>>> diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c >>>> index ae5f4acf2675..7ea1017ef790 100644 >>>> --- a/drivers/acpi/nfit/core.c >>>> +++ b/drivers/acpi/nfit/core.c >>>> @@ -15,6 +15,8 @@ >>>> #include <linux/sort.h> >>>> #include <linux/io.h> >>>> #include <linux/nd.h> >>>> +#include <linux/memory.h> >>>> +#include <linux/memory-tiers.h> >>>> #include <asm/cacheflush.h> >>>> #include <acpi/nfit.h> >>>> #include "intel.h" >>>> @@ -3470,6 +3472,39 @@ static struct acpi_driver acpi_nfit_driver = { >>>> }, >>>> }; >>>> >>>> +static int nfit_callback(struct notifier_block *self, >>>> + unsigned long action, void *arg) >>>> +{ >>>> + bool found = false; >>>> + struct memory_notify *mnb = arg; >>>> + int nid = mnb->status_change_nid; >>>> + struct nfit_spa *nfit_spa; >>>> + struct acpi_nfit_desc *acpi_desc; >>>> + >>>> + if (nid == NUMA_NO_NODE || action != MEM_ONLINE) >>>> + return NOTIFY_OK; >>>> + >>>> + mutex_lock(&acpi_desc_lock); >>>> + list_for_each_entry(acpi_desc, &acpi_descs, list) { >>>> + mutex_lock(&acpi_desc->init_mutex); >>>> + list_for_each_entry(nfit_spa, &acpi_desc->spas, list) { >>>> + struct acpi_nfit_system_address *spa = nfit_spa->spa; >>>> + int target_node = pxm_to_node(spa->proximity_domain); >>>> + >>>> + if (target_node == nid) { >>>> + node_devices[nid]->perf_level = MEMTIER_PERF_LEVEL_PMEM; >>>> + found = true; >>>> + break; >>>> + } >>>> + } >>>> + mutex_unlock(&acpi_desc->init_mutex); >>>> + if (found) >>>> + break; >>>> + } >>>> + mutex_unlock(&acpi_desc_lock); >>>> + return NOTIFY_OK; >>>> +} >>>> + >>>> static __init int nfit_init(void) >>>> { >>>> int ret; >>>> @@ -3509,7 +3544,11 @@ static __init int nfit_init(void) >>>> nfit_mce_unregister(); >>>> destroy_workqueue(nfit_wq); >>>> } >>>> - >>>> + /* >>>> + * register a memory hotplug notifier at prio 2 so that we >>>> + * can update the perf level for the node. >>>> + */ >>>> + hotplug_memory_notifier(nfit_callback, MEMTIER_HOTPLUG_PRIO + 1); >>>> return ret; >>>> >>>> } >>> >>> I don't think that it's a good idea to set perf_level of a memory device >>> (node) via NFIT only. >> >>> >>> For example, we may prefer HMAT over NFIT when it's available. So the >>> perf_level should be set in dax/kmem.c based on information provided by >>> ACPI or other information sources. ACPI can provide some functions/data >>> structures to let drivers (like dax/kmem.c) to query the properties of >>> the memory device (node). >>> >> >> I was trying to make it architecture specific so that we have a placeholder >> to fine-tune this better. For example, ppc64 will look at device tree >> details to find the performance level and x86 will look at ACPI data structure. >> Adding that hotplug callback in dax/kmem will prevent that architecture-specific >> customization? >> >> I would expect that callback to move to the generic ACPI layer so that even >> firmware managed CXL devices can be added to a lower tier? I don't understand >> ACPI enough to find the right abstraction for that hotplug callback. > > I'm OK for this to be architecture specific. > > But ACPI NFIT isn't enough for x86. For example, PMEM can be added to a > virtual machine as normal memory nodes without NFIT. Instead, PMEM is > marked via "memmap=<nn>G!<ss>G" or "efi_fake_mem=<nn>G@<ss>G:0x40000", > and dax/kmem.c is used to hot-add the memory. > > So, before a more sophisticated version is implemented for x86. The > simplest version as I suggested below works even better. > >>> As the simplest first version, this can be just hard coded. >>> >> >> If you are suggesting to not use hotplug callback, one of the challenge was node_devices[nid] >> get allocated pretty late when we try to online the node. > > As the simplest first version, this can be as simple as, > > /* dax/kmem.c */ > static int dev_dax_kmem_probe(struct dev_dax *dev_dax) > { > node_devices[dev_dax->target_node]->perf_level = MEMTIER_PERF_LEVEL_PMEM; > /* add_memory_driver_managed() */ > } > > To be compatible with ppc64 version, how about make dev_dax_kmem_probe() > set perf_level only if it's uninitialized? That will result in kernel crash because node_devices[dev_dax->target_node] is not initialized there. it get allocated in add_memory_resource -> __try_online_node -> register_one_node -> __register_one_node -> node_devices[nid] = kzalloc(sizeof(struct node), GFP_KERNEL); -aneesh