On Mon, Jun 24, 2019 at 8:34 PM Dan Williams <dan.j.williams@xxxxxxxxx> wrote:
>
> Memory that has been tagged EFI_MEMORY_SP, and has performance
> properties described by the ACPI HMAT is expected to have an application
> specific consumer.
>
> Those consumers may want 100% of the memory capacity to be reserved from
> any usage by the kernel. By default, with this enabling, a platform
> device is created to represent this differentiated resource.
>
> The device-dax "hmem" driver claims these devices by default and
> provides an mmap interface for the target application. If the
> administrator prefers, the hmem resource range can be made available to
> the core-mm via the device-dax hotplug facility, kmem, to online the
> memory with its own numa node.
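
Just to confirm my reading of the consumer model: the target application
maps the dax character device directly, roughly along the lines of the
untested userspace sketch below. This is my own illustration, not part of
the series; it assumes the dax0.0 device and 2M alignment shown in the
results further down, and the 1 GiB mapping length is arbitrary.

/* hypothetical consumer sketch, not part of this patch */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        /* length must be a multiple of the device alignment (2M here) */
        size_t len = 1UL << 30;
        int fd = open("/dev/dax0.0", O_RDWR);
        void *addr;

        if (fd < 0) {
                perror("open /dev/dax0.0");
                return EXIT_FAILURE;
        }

        addr = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (addr == MAP_FAILED) {
                perror("mmap");
                close(fd);
                return EXIT_FAILURE;
        }

        /* the application can now use the reserved range directly */
        memset(addr, 0, len);

        munmap(addr, len);
        close(fd);
        return EXIT_SUCCESS;
}

...and nothing else in the kernel touches that capacity unless the
administrator explicitly hands it over to kmem. That matches what I
expected.
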
>
> This was tested with an emulated HMAT produced by qemu (with the pending
> HMAT enabling patches), and "efi_fake_mem=8G@9G:0x40000" on the kernel
> command line to mark the memory ranges associated with node2 and node3
> as EFI_MEMORY_SP.
>
> qemu numa configuration options:
>
> -numa node,mem=4G,cpus=0-19,nodeid=0
> -numa node,mem=4G,cpus=20-39,nodeid=1
> -numa node,mem=4G,nodeid=2
> -numa node,mem=4G,nodeid=3
> -numa dist,src=0,dst=0,val=10
> -numa dist,src=0,dst=1,val=21
> -numa dist,src=0,dst=2,val=21
> -numa dist,src=0,dst=3,val=21
> -numa dist,src=1,dst=0,val=21
> -numa dist,src=1,dst=1,val=10
> -numa dist,src=1,dst=2,val=21
> -numa dist,src=1,dst=3,val=21
> -numa dist,src=2,dst=0,val=21
> -numa dist,src=2,dst=1,val=21
> -numa dist,src=2,dst=2,val=10
> -numa dist,src=2,dst=3,val=21
> -numa dist,src=3,dst=0,val=21
> -numa dist,src=3,dst=1,val=21
> -numa dist,src=3,dst=2,val=21
> -numa dist,src=3,dst=3,val=10
> -numa hmat-lb,initiator=0,target=0,hierarchy=memory,data-type=access-latency,base-lat=10,latency=5
> -numa hmat-lb,initiator=0,target=0,hierarchy=memory,data-type=access-bandwidth,base-bw=20,bandwidth=5
> -numa hmat-lb,initiator=0,target=1,hierarchy=memory,data-type=access-latency,base-lat=10,latency=10
> -numa hmat-lb,initiator=0,target=1,hierarchy=memory,data-type=access-bandwidth,base-bw=20,bandwidth=10
> -numa hmat-lb,initiator=0,target=2,hierarchy=memory,data-type=access-latency,base-lat=10,latency=15
> -numa hmat-lb,initiator=0,target=2,hierarchy=memory,data-type=access-bandwidth,base-bw=20,bandwidth=15
> -numa hmat-lb,initiator=0,target=3,hierarchy=memory,data-type=access-latency,base-lat=10,latency=20
> -numa hmat-lb,initiator=0,target=3,hierarchy=memory,data-type=access-bandwidth,base-bw=20,bandwidth=20
> -numa hmat-lb,initiator=1,target=0,hierarchy=memory,data-type=access-latency,base-lat=10,latency=10
> -numa hmat-lb,initiator=1,target=0,hierarchy=memory,data-type=access-bandwidth,base-bw=20,bandwidth=10
> -numa hmat-lb,initiator=1,target=1,hierarchy=memory,data-type=access-latency,base-lat=10,latency=5
> -numa hmat-lb,initiator=1,target=1,hierarchy=memory,data-type=access-bandwidth,base-bw=20,bandwidth=5
> -numa hmat-lb,initiator=1,target=2,hierarchy=memory,data-type=access-latency,base-lat=10,latency=15
> -numa hmat-lb,initiator=1,target=2,hierarchy=memory,data-type=access-bandwidth,base-bw=20,bandwidth=15
> -numa hmat-lb,initiator=1,target=3,hierarchy=memory,data-type=access-latency,base-lat=10,latency=20
> -numa hmat-lb,initiator=1,target=3,hierarchy=memory,data-type=access-bandwidth,base-bw=20,bandwidth=20
>
> Result:
>
> # daxctl list -RDu
> [
>   {
>     "path":"\/platform\/hmem.1",
>     "id":1,
>     "size":"4.00 GiB (4.29 GB)",
>     "align":2097152,
>     "devices":[
>       {
>         "chardev":"dax1.0",
>         "size":"4.00 GiB (4.29 GB)"
>       }
>     ]
>   },
>   {
>     "path":"\/platform\/hmem.0",
>     "id":0,
>     "size":"4.00 GiB (4.29 GB)",
>     "align":2097152,
>     "devices":[
>       {
>         "chardev":"dax0.0",
>         "size":"4.00 GiB (4.29 GB)"
>       }
>     ]
>   }
> ]
>
> # cat /proc/iomem
> [..]
> 240000000-43fffffff : Application Reserved
>   240000000-33fffffff : hmem.0
>     240000000-33fffffff : dax0.0
>   340000000-43fffffff : hmem.1
>     340000000-43fffffff : dax1.0
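
Looks good. And for the opt-in kernel-managed case, the administrator
would rebind one of these devices to kmem and online it as a regular
node, e.g. something like "daxctl reconfigure-device --mode=system-ram
dax1.0" (quoting the daxctl syntax from memory, so treat the exact option
spelling as approximate), after which the hmem.1 range is onlined as
ordinary memory in its own numa node, as the changelog says.
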
>
> Cc: Len Brown <lenb@xxxxxxxxxx>
> Cc: Keith Busch <keith.busch@xxxxxxxxx>
> Cc: "Rafael J. Wysocki" <rjw@xxxxxxxxxxxxx>
> Cc: Vishal Verma <vishal.l.verma@xxxxxxxxx>
> Cc: Jonathan Cameron <Jonathan.Cameron@xxxxxxxxxx>
> Reviewed-by: Dave Hansen <dave.hansen@xxxxxxxxxxxxxxx>
> Signed-off-by: Dan Williams <dan.j.williams@xxxxxxxxx>

Acked-by: Rafael J. Wysocki <rafael.j.wysocki@xxxxxxxxx>

> ---
>  drivers/acpi/numa/Kconfig |    1
>  drivers/acpi/numa/hmat.c  |  132 +++++++++++++++++++++++++++++++++++++++++----
>  2 files changed, 122 insertions(+), 11 deletions(-)
>
> diff --git a/drivers/acpi/numa/Kconfig b/drivers/acpi/numa/Kconfig
> index d14582387ed0..c1be746e111a 100644
> --- a/drivers/acpi/numa/Kconfig
> +++ b/drivers/acpi/numa/Kconfig
> @@ -8,6 +8,7 @@ config ACPI_HMAT
>          bool "ACPI Heterogeneous Memory Attribute Table Support"
>          depends on ACPI_NUMA
>          select HMEM_REPORTING
> +        select MEMREGION
>          help
>            If set, this option has the kernel parse and report the
>            platform's ACPI HMAT (Heterogeneous Memory Attributes Table),
> diff --git a/drivers/acpi/numa/hmat.c b/drivers/acpi/numa/hmat.c
> index 1d329c4af3bf..622c5feb3aa0 100644
> --- a/drivers/acpi/numa/hmat.c
> +++ b/drivers/acpi/numa/hmat.c
> @@ -8,11 +8,16 @@
>   * the applicable attributes with the node's interfaces.
>   */
>
> +#define pr_fmt(fmt) "acpi/hmat: " fmt
> +#define dev_fmt(fmt) "acpi/hmat: " fmt
> +
>  #include <linux/acpi.h>
>  #include <linux/bitops.h>
>  #include <linux/device.h>
>  #include <linux/init.h>
>  #include <linux/list.h>
> +#include <linux/mm.h>
> +#include <linux/platform_device.h>
>  #include <linux/list_sort.h>
>  #include <linux/node.h>
>  #include <linux/sysfs.h>
> @@ -40,6 +45,7 @@ struct memory_target {
>          struct list_head node;
>          unsigned int memory_pxm;
>          unsigned int processor_pxm;
> +        struct resource memregions;
>          struct node_hmem_attrs hmem_attrs;
>  };
>
> @@ -92,21 +98,35 @@ static __init void alloc_memory_initiator(unsigned int cpu_pxm)
>          list_add_tail(&initiator->node, &initiators);
>  }
>
> -static __init void alloc_memory_target(unsigned int mem_pxm)
> +static __init void alloc_memory_target(unsigned int mem_pxm,
> +                resource_size_t start, resource_size_t len)
>  {
>          struct memory_target *target;
>
>          target = find_mem_target(mem_pxm);
> -        if (target)
> -                return;
> -
> -        target = kzalloc(sizeof(*target), GFP_KERNEL);
> -        if (!target)
> -                return;
> +        if (!target) {
> +                target = kzalloc(sizeof(*target), GFP_KERNEL);
> +                if (!target)
> +                        return;
> +                target->memory_pxm = mem_pxm;
> +                target->processor_pxm = PXM_INVAL;
> +                target->memregions = (struct resource) {
> +                        .name = "ACPI mem",
> +                        .start = 0,
> +                        .end = -1,
> +                        .flags = IORESOURCE_MEM,
> +                };
> +                list_add_tail(&target->node, &targets);
> +        }
>
> -        target->memory_pxm = mem_pxm;
> -        target->processor_pxm = PXM_INVAL;
> -        list_add_tail(&target->node, &targets);
> +        /*
> +         * There are potentially multiple ranges per PXM, so record each
> +         * in the per-target memregions resource tree.
> +         */
> +        if (!__request_region(&target->memregions, start, len, "memory target",
> +                                IORESOURCE_MEM))
> +                pr_warn("failed to reserve %#llx - %#llx in pxm: %d\n",
> +                                start, start + len, mem_pxm);
>  }
>
>  static __init const char *hmat_data_type(u8 type)
> @@ -428,7 +448,7 @@ static __init int srat_parse_mem_affinity(union acpi_subtable_headers *header,
>                  return -EINVAL;
>          if (!(ma->flags & ACPI_SRAT_MEM_ENABLED))
>                  return 0;
> -        alloc_memory_target(ma->proximity_domain);
> +        alloc_memory_target(ma->proximity_domain, ma->base_address, ma->length);
>          return 0;
>  }
>
> @@ -580,6 +600,81 @@ static __init void hmat_register_target_perf(struct memory_target *target)
>          node_set_perf_attrs(mem_nid, &target->hmem_attrs, 0);
>  }
>
> +static __init void hmat_register_target_device(struct memory_target *target,
> +                struct resource *r)
> +{
> +        /* define a clean / non-busy resource for the platform device */
> +        struct resource res = {
> +                .start = r->start,
> +                .end = r->end,
> +                .flags = IORESOURCE_MEM,
> +        };
> +        struct platform_device *pdev;
> +        struct memregion_info info;
> +        int rc, id;
> +
> +        rc = region_intersects(res.start, resource_size(&res), IORESOURCE_MEM,
> +                        IORES_DESC_APPLICATION_RESERVED);
> +        if (rc != REGION_INTERSECTS)
> +                return;
> +
> +        id = memregion_alloc(GFP_KERNEL);
> +        if (id < 0) {
> +                pr_err("memregion allocation failure for %pr\n", &res);
> +                return;
> +        }
> +
> +        pdev = platform_device_alloc("hmem", id);
> +        if (!pdev) {
> +                pr_err("hmem device allocation failure for %pr\n", &res);
> +                goto out_pdev;
> +        }
> +
> +        pdev->dev.numa_node = acpi_map_pxm_to_online_node(target->memory_pxm);
> +        info = (struct memregion_info) {
> +                .target_node = acpi_map_pxm_to_node(target->memory_pxm),
> +        };
> +        rc = platform_device_add_data(pdev, &info, sizeof(info));
> +        if (rc < 0) {
> +                pr_err("hmem memregion_info allocation failure for %pr\n", &res);
> +                goto out_pdev;
> +        }
> +
> +        rc = platform_device_add_resources(pdev, &res, 1);
> +        if (rc < 0) {
> +                pr_err("hmem resource allocation failure for %pr\n", &res);
> +                goto out_resource;
> +        }
> +
> +        rc = platform_device_add(pdev);
> +        if (rc < 0) {
> +                dev_err(&pdev->dev, "device add failed for %pr\n", &res);
> +                goto out_resource;
> +        }
> +
> +        return;
> +
> +out_resource:
> +        put_device(&pdev->dev);
> +out_pdev:
> +        memregion_free(id);
> +}
> +
> +static __init void hmat_register_target_devices(struct memory_target *target)
> +{
> +        struct resource *res;
> +
> +        /*
> +         * Do not bother creating devices if no driver is available to
> +         * consume them.
> +         */
> +        if (!IS_ENABLED(CONFIG_DEV_DAX_HMEM))
> +                return;
> +
> +        for (res = target->memregions.child; res; res = res->sibling)
> +                hmat_register_target_device(target, res);
> +}
> +
>  static __init void hmat_register_targets(void)
>  {
>          struct memory_target *target;
> @@ -587,6 +682,12 @@ static __init void hmat_register_targets(void)
>          list_for_each_entry(target, &targets, node) {
>                  int nid = pxm_to_node(target->memory_pxm);
>
> +                /*
> +                 * Devices may belong to either an offline or online
> +                 * node, so unconditionally add them.
> +                 */
> +                hmat_register_target_devices(target);
> +
>                  /*
>                   * Skip offline nodes. This can happen when memory
>                   * marked EFI_MEMORY_SP, "specific purpose", is applied
> @@ -608,7 +709,16 @@ static __init void hmat_free_structures(void)
>          struct memory_initiator *initiator, *inext;
>
>          list_for_each_entry_safe(target, tnext, &targets, node) {
> +                struct resource *res, *res_next;
> +
>                  list_del(&target->node);
> +                res = target->memregions.child;
> +                while (res) {
> +                        res_next = res->sibling;
> +                        __release_region(&target->memregions, res->start,
> +                                        resource_size(res));
> +                        res = res_next;
> +                }
>                  kfree(target);
>          }
>
>
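
One more note for readers following along: the actual consumer of these
platform devices is the device-dax "hmem" driver from the follow-on
patches, which is not shown here. Purely to illustrate the hand-off, the
binding side would look roughly like the sketch below. This is my own
sketch, not the real driver; the hmem_sketch_* names are made up, and I'm
assuming the memregion_info layout from the companion MEMREGION patch.

/*
 * Illustrative sketch of a driver binding to the "hmem" platform devices
 * registered above -- not the real device-dax hmem driver.
 */
#include <linux/ioport.h>
#include <linux/memregion.h>        /* struct memregion_info (companion patch) */
#include <linux/module.h>
#include <linux/platform_device.h>

static int hmem_sketch_probe(struct platform_device *pdev)
{
        /* platform data attached via platform_device_add_data() above */
        struct memregion_info *info = dev_get_platdata(&pdev->dev);
        /* the range attached via platform_device_add_resources() above */
        struct resource *res = platform_get_resource(pdev, IORESOURCE_MEM, 0);

        if (!info || !res)
                return -ENXIO;

        dev_info(&pdev->dev, "claiming %pR for target node %d\n",
                        res, info->target_node);

        /* a real driver would now instantiate a dev_dax instance over @res */
        return 0;
}

static struct platform_driver hmem_sketch_driver = {
        .probe = hmem_sketch_probe,
        .driver = {
                /* matches platform_device_alloc("hmem", id) in the patch */
                .name = "hmem",
        },
};
module_platform_driver(hmem_sketch_driver);
MODULE_LICENSE("GPL v2");

I like that the split keeps policy out of the ACPI code: hmat.c only
describes and reserves the ranges, and whatever binds to "hmem" decides
how they get used.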