Re: [PATCH v7 12/12] acpi/numa/hmat: Register "soft reserved" memory as an "hmem" device

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Wed, Oct 16, 2019 at 3:14 AM Dan Williams <dan.j.williams@xxxxxxxxx> wrote:
>
> Memory that has been tagged EFI_MEMORY_SP, and has performance
> properties described by the ACPI HMAT is expected to have an application
> specific consumer.
>
> Those consumers may want 100% of the memory capacity to be reserved from
> any usage by the kernel. By default, with this enabling, a platform
> device is created to represent this differentiated resource.
>
> The device-dax "hmem" driver claims these devices by default and
> provides an mmap interface for the target application.  If the
> administrator prefers, the hmem resource range can be made available to
> the core-mm via the device-dax hotplug facility, kmem, to online the
> memory with its own numa node.
>
> This was tested with an emulated HMAT produced by qemu (with the pending
> HMAT enabling patches), and "efi_fake_mem=8G@9G:0x40000" on the kernel
> command line to mark the memory ranges associated with node2 and node3
> as EFI_MEMORY_SP.
>
> qemu numa configuration options:
>
> -numa node,mem=4G,cpus=0-19,nodeid=0
> -numa node,mem=4G,cpus=20-39,nodeid=1
> -numa node,mem=4G,nodeid=2
> -numa node,mem=4G,nodeid=3
> -numa dist,src=0,dst=0,val=10
> -numa dist,src=0,dst=1,val=21
> -numa dist,src=0,dst=2,val=21
> -numa dist,src=0,dst=3,val=21
> -numa dist,src=1,dst=0,val=21
> -numa dist,src=1,dst=1,val=10
> -numa dist,src=1,dst=2,val=21
> -numa dist,src=1,dst=3,val=21
> -numa dist,src=2,dst=0,val=21
> -numa dist,src=2,dst=1,val=21
> -numa dist,src=2,dst=2,val=10
> -numa dist,src=2,dst=3,val=21
> -numa dist,src=3,dst=0,val=21
> -numa dist,src=3,dst=1,val=21
> -numa dist,src=3,dst=2,val=21
> -numa dist,src=3,dst=3,val=10
> -numa hmat-lb,initiator=0,target=0,hierarchy=memory,data-type=access-latency,base-lat=10,latency=5
> -numa hmat-lb,initiator=0,target=0,hierarchy=memory,data-type=access-bandwidth,base-bw=20,bandwidth=5
> -numa hmat-lb,initiator=0,target=1,hierarchy=memory,data-type=access-latency,base-lat=10,latency=10
> -numa hmat-lb,initiator=0,target=1,hierarchy=memory,data-type=access-bandwidth,base-bw=20,bandwidth=10
> -numa hmat-lb,initiator=0,target=2,hierarchy=memory,data-type=access-latency,base-lat=10,latency=15
> -numa hmat-lb,initiator=0,target=2,hierarchy=memory,data-type=access-bandwidth,base-bw=20,bandwidth=15
> -numa hmat-lb,initiator=0,target=3,hierarchy=memory,data-type=access-latency,base-lat=10,latency=20
> -numa hmat-lb,initiator=0,target=3,hierarchy=memory,data-type=access-bandwidth,base-bw=20,bandwidth=20
> -numa hmat-lb,initiator=1,target=0,hierarchy=memory,data-type=access-latency,base-lat=10,latency=10
> -numa hmat-lb,initiator=1,target=0,hierarchy=memory,data-type=access-bandwidth,base-bw=20,bandwidth=10
> -numa hmat-lb,initiator=1,target=1,hierarchy=memory,data-type=access-latency,base-lat=10,latency=5
> -numa hmat-lb,initiator=1,target=1,hierarchy=memory,data-type=access-bandwidth,base-bw=20,bandwidth=5
> -numa hmat-lb,initiator=1,target=2,hierarchy=memory,data-type=access-latency,base-lat=10,latency=15
> -numa hmat-lb,initiator=1,target=2,hierarchy=memory,data-type=access-bandwidth,base-bw=20,bandwidth=15
> -numa hmat-lb,initiator=1,target=3,hierarchy=memory,data-type=access-latency,base-lat=10,latency=20
> -numa hmat-lb,initiator=1,target=3,hierarchy=memory,data-type=access-bandwidth,base-bw=20,bandwidth=20
>
> Result:
>
> # daxctl list -RDu
> [
>   {
>     "path":"\/platform\/hmem.1",
>     "id":1,
>     "size":"4.00 GiB (4.29 GB)",
>     "align":2097152,
>     "devices":[
>       {
>         "chardev":"dax1.0",
>         "size":"4.00 GiB (4.29 GB)"
>       }
>     ]
>   },
>   {
>     "path":"\/platform\/hmem.0",
>     "id":0,
>     "size":"4.00 GiB (4.29 GB)",
>     "align":2097152,
>     "devices":[
>       {
>         "chardev":"dax0.0",
>         "size":"4.00 GiB (4.29 GB)"
>       }
>     ]
>   }
> ]
>
> # cat /proc/iomem
> [..]
> 240000000-43fffffff : Soft Reserved
>   240000000-33fffffff : hmem.0
>     240000000-33fffffff : dax0.0
>   340000000-43fffffff : hmem.1
>     340000000-43fffffff : dax1.0
>
> Cc: Len Brown <lenb@xxxxxxxxxx>
> Cc: Keith Busch <kbusch@xxxxxxxxxx>
> Cc: "Rafael J. Wysocki" <rjw@xxxxxxxxxxxxx>
> Cc: Vishal Verma <vishal.l.verma@xxxxxxxxx>
> Cc: Jonathan Cameron <Jonathan.Cameron@xxxxxxxxxx>
> Reviewed-by: Dave Hansen <dave.hansen@xxxxxxxxxxxxxxx>
> Signed-off-by: Dan Williams <dan.j.williams@xxxxxxxxx>

Acked-by: Rafael J. Wysocki <rafael.j.wysocki@xxxxxxxxx>

> ---
>  drivers/acpi/numa/Kconfig |    1
>  drivers/acpi/numa/hmat.c  |  136 +++++++++++++++++++++++++++++++++++++++++----
>  2 files changed, 125 insertions(+), 12 deletions(-)
>
> diff --git a/drivers/acpi/numa/Kconfig b/drivers/acpi/numa/Kconfig
> index acbd5aa76e40..fcf2e556d69d 100644
> --- a/drivers/acpi/numa/Kconfig
> +++ b/drivers/acpi/numa/Kconfig
> @@ -9,6 +9,7 @@ config ACPI_HMAT
>         bool "ACPI Heterogeneous Memory Attribute Table Support"
>         depends on ACPI_NUMA
>         select HMEM_REPORTING
> +       select MEMREGION
>         help
>          If set, this option has the kernel parse and report the
>          platform's ACPI HMAT (Heterogeneous Memory Attributes Table),
> diff --git a/drivers/acpi/numa/hmat.c b/drivers/acpi/numa/hmat.c
> index 4707eb9dd07b..eaa5a0f93dec 100644
> --- a/drivers/acpi/numa/hmat.c
> +++ b/drivers/acpi/numa/hmat.c
> @@ -8,12 +8,18 @@
>   * the applicable attributes with the node's interfaces.
>   */
>
> +#define pr_fmt(fmt) "acpi/hmat: " fmt
> +#define dev_fmt(fmt) "acpi/hmat: " fmt
> +
>  #include <linux/acpi.h>
>  #include <linux/bitops.h>
>  #include <linux/device.h>
>  #include <linux/init.h>
>  #include <linux/list.h>
> +#include <linux/mm.h>
> +#include <linux/platform_device.h>
>  #include <linux/list_sort.h>
> +#include <linux/memregion.h>
>  #include <linux/memory.h>
>  #include <linux/mutex.h>
>  #include <linux/node.h>
> @@ -49,6 +55,7 @@ struct memory_target {
>         struct list_head node;
>         unsigned int memory_pxm;
>         unsigned int processor_pxm;
> +       struct resource memregions;
>         struct node_hmem_attrs hmem_attrs;
>         struct list_head caches;
>         struct node_cache_attrs cache_attrs;
> @@ -104,22 +111,36 @@ static __init void alloc_memory_initiator(unsigned int cpu_pxm)
>         list_add_tail(&initiator->node, &initiators);
>  }
>
> -static __init void alloc_memory_target(unsigned int mem_pxm)
> +static __init void alloc_memory_target(unsigned int mem_pxm,
> +               resource_size_t start, resource_size_t len)
>  {
>         struct memory_target *target;
>
>         target = find_mem_target(mem_pxm);
> -       if (target)
> -               return;
> -
> -       target = kzalloc(sizeof(*target), GFP_KERNEL);
> -       if (!target)
> -               return;
> +       if (!target) {
> +               target = kzalloc(sizeof(*target), GFP_KERNEL);
> +               if (!target)
> +                       return;
> +               target->memory_pxm = mem_pxm;
> +               target->processor_pxm = PXM_INVAL;
> +               target->memregions = (struct resource) {
> +                       .name   = "ACPI mem",
> +                       .start  = 0,
> +                       .end    = -1,
> +                       .flags  = IORESOURCE_MEM,
> +               };
> +               list_add_tail(&target->node, &targets);
> +               INIT_LIST_HEAD(&target->caches);
> +       }
>
> -       target->memory_pxm = mem_pxm;
> -       target->processor_pxm = PXM_INVAL;
> -       list_add_tail(&target->node, &targets);
> -       INIT_LIST_HEAD(&target->caches);
> +       /*
> +        * There are potentially multiple ranges per PXM, so record each
> +        * in the per-target memregions resource tree.
> +        */
> +       if (!__request_region(&target->memregions, start, len, "memory target",
> +                               IORESOURCE_MEM))
> +               pr_warn("failed to reserve %#llx - %#llx in pxm: %d\n",
> +                               start, start + len, mem_pxm);
>  }
>
>  static __init const char *hmat_data_type(u8 type)
> @@ -452,7 +473,7 @@ static __init int srat_parse_mem_affinity(union acpi_subtable_headers *header,
>                 return -EINVAL;
>         if (!(ma->flags & ACPI_SRAT_MEM_ENABLED))
>                 return 0;
> -       alloc_memory_target(ma->proximity_domain);
> +       alloc_memory_target(ma->proximity_domain, ma->base_address, ma->length);
>         return 0;
>  }
>
> @@ -613,10 +634,91 @@ static void hmat_register_target_perf(struct memory_target *target)
>         node_set_perf_attrs(mem_nid, &target->hmem_attrs, 0);
>  }
>
> +static void hmat_register_target_device(struct memory_target *target,
> +               struct resource *r)
> +{
> +       /* define a clean / non-busy resource for the platform device */
> +       struct resource res = {
> +               .start = r->start,
> +               .end = r->end,
> +               .flags = IORESOURCE_MEM,
> +       };
> +       struct platform_device *pdev;
> +       struct memregion_info info;
> +       int rc, id;
> +
> +       rc = region_intersects(res.start, resource_size(&res), IORESOURCE_MEM,
> +                       IORES_DESC_SOFT_RESERVED);
> +       if (rc != REGION_INTERSECTS)
> +               return;
> +
> +       id = memregion_alloc(GFP_KERNEL);
> +       if (id < 0) {
> +               pr_err("memregion allocation failure for %pr\n", &res);
> +               return;
> +       }
> +
> +       pdev = platform_device_alloc("hmem", id);
> +       if (!pdev) {
> +               pr_err("hmem device allocation failure for %pr\n", &res);
> +               goto out_pdev;
> +       }
> +
> +       pdev->dev.numa_node = acpi_map_pxm_to_online_node(target->memory_pxm);
> +       info = (struct memregion_info) {
> +               .target_node = acpi_map_pxm_to_node(target->memory_pxm),
> +       };
> +       rc = platform_device_add_data(pdev, &info, sizeof(info));
> +       if (rc < 0) {
> +               pr_err("hmem memregion_info allocation failure for %pr\n", &res);
> +               goto out_pdev;
> +       }
> +
> +       rc = platform_device_add_resources(pdev, &res, 1);
> +       if (rc < 0) {
> +               pr_err("hmem resource allocation failure for %pr\n", &res);
> +               goto out_resource;
> +       }
> +
> +       rc = platform_device_add(pdev);
> +       if (rc < 0) {
> +               dev_err(&pdev->dev, "device add failed for %pr\n", &res);
> +               goto out_resource;
> +       }
> +
> +       return;
> +
> +out_resource:
> +       put_device(&pdev->dev);
> +out_pdev:
> +       memregion_free(id);
> +}
> +
> +static __init void hmat_register_target_devices(struct memory_target *target)
> +{
> +       struct resource *res;
> +
> +       /*
> +        * Do not bother creating devices if no driver is available to
> +        * consume them.
> +        */
> +       if (!IS_ENABLED(CONFIG_DEV_DAX_HMEM))
> +               return;
> +
> +       for (res = target->memregions.child; res; res = res->sibling)
> +               hmat_register_target_device(target, res);
> +}
> +
>  static void hmat_register_target(struct memory_target *target)
>  {
>         int nid = pxm_to_node(target->memory_pxm);
>
> +       /*
> +        * Devices may belong to either an offline or online
> +        * node, so unconditionally add them.
> +        */
> +       hmat_register_target_devices(target);
> +
>         /*
>          * Skip offline nodes. This can happen when memory
>          * marked EFI_MEMORY_SP, "specific purpose", is applied
> @@ -677,11 +779,21 @@ static __init void hmat_free_structures(void)
>         struct target_cache *tcache, *cnext;
>
>         list_for_each_entry_safe(target, tnext, &targets, node) {
> +               struct resource *res, *res_next;
> +
>                 list_for_each_entry_safe(tcache, cnext, &target->caches, node) {
>                         list_del(&tcache->node);
>                         kfree(tcache);
>                 }
> +
>                 list_del(&target->node);
> +               res = target->memregions.child;
> +               while (res) {
> +                       res_next = res->sibling;
> +                       __release_region(&target->memregions, res->start,
> +                                       resource_size(res));
> +                       res = res_next;
> +               }
>                 kfree(target);
>         }
>
>



[Index of Archives]     [Linux ARM Kernel]     [Linux ARM]     [Linux Omap]     [Fedora ARM]     [IETF Annouce]     [Security]     [Bugtraq]     [Linux OMAP]     [Linux MIPS]     [ECOS]     [Asterisk Internet PBX]     [Linux API]

  Powered by Linux