On Tue, Oct 22, 2024 at 05:34:50PM -0400, Gregory Price wrote: > Capacity is stranded when CFMWS regions are not aligned to block size. > On x86, block size increases with capacity (2G blocks @ 64G capacity). > > Use CFMWS base/size to report memory block size alignment advice. > > After the alignment, the acpi code begins populating numa nodes with > memblocks, so probe the value just prior to lock it in. All future > callers should be providing advice prior to this point. > > Suggested-by: Dan Williams <dan.j.williams@xxxxxxxxx> > Signed-off-by: Gregory Price <gourry@xxxxxxxxxx> > --- > drivers/acpi/numa/srat.c | 33 +++++++++++++++++++++++++++++++++ > 1 file changed, 33 insertions(+) > > diff --git a/drivers/acpi/numa/srat.c b/drivers/acpi/numa/srat.c > index 44f91f2c6c5d..35e6f7c17f60 100644 > --- a/drivers/acpi/numa/srat.c > +++ b/drivers/acpi/numa/srat.c > @@ -14,6 +14,7 @@ > #include <linux/errno.h> > #include <linux/acpi.h> > #include <linux/memblock.h> > +#include <linux/memory.h> > #include <linux/numa.h> > #include <linux/nodemask.h> > #include <linux/topology.h> > @@ -333,6 +334,29 @@ acpi_parse_memory_affinity(union acpi_subtable_headers *header, > return 0; > } > > +/* Advise memblock on maximum block size to avoid stranded capacity. */ > +static int __init acpi_align_cfmws(union acpi_subtable_headers *header, > + void *arg, const unsigned long table_end) > +{ > + struct acpi_cedt_cfmws *cfmws = (struct acpi_cedt_cfmws *)header; > + u64 start = cfmws->base_hpa; > + u64 size = cfmws->window_size; > + unsigned long bz; Maybe unsigned long size? 
> + > + for (bz = SZ_64T; bz >= SZ_256M; bz >>= 1) { > + if (IS_ALIGNED(start, bz) && IS_ALIGNED(size, bz)) > + break; > + } > + > + if (bz >= SZ_256M) { > + if (memory_block_advise_max_size(bz) < 0) > + pr_warn("CFMWS: memblock size advise failed\n"); > + } else Nit: braces are needed for the else arm as well. > + pr_err("CFMWS: [BIOS BUG] base/size alignment violates spec\n"); > + > + return 0; > +} > + > static int __init acpi_parse_cfmws(union acpi_subtable_headers *header, > void *arg, const unsigned long table_end) > { > @@ -545,6 +569,15 @@ int __init acpi_numa_init(void) > * Initialize a fake_pxm as the first available PXM to emulate. > */ > > + /* Align memblock size to CFMW regions if possible */ > + acpi_table_parse_cedt(ACPI_CEDT_TYPE_CFMWS, acpi_align_cfmws, NULL); > + > + /* > + * Nodes start populating with blocks after this, so probe the max > + * block size to prevent it from changing in the future. > + */ > + memory_block_probe_max_size(); > + It won't change, but how will drivers/base/memory.c know about the probed size if the architecture does not override memory_block_size_bytes()? > /* fake_pxm is the next unused PXM value after SRAT parsing */ > for (i = 0, fake_pxm = -1; i < MAX_NUMNODES; i++) { > if (node_to_pxm_map[i] > fake_pxm) > -- > 2.43.0 > -- Sincerely yours, Mike.