On Fri, 2023-02-10 at 01:07 -0800, Dan Williams wrote: > The default mode for device-dax instances is backwards for RAM-regions > as evidenced by the fact that it tends to catch end users by surprise. > "Where is my memory?". Recall that platforms are increasingly shipping > with performance-differentiated memory pools beyond typical DRAM and > NUMA effects. This includes HBM (high-bandwidth-memory) and CXL (dynamic > interleave, varied media types, and future fabric attached > possibilities). > > For this reason the EFI_MEMORY_SP (EFI Special Purpose Memory => Linux > 'Soft Reserved') attribute is expected to be applied to all memory-pools > that are not the general purpose pool. This designation gives an > Operating System a chance to defer usage of a memory pool until later in > the boot process where its performance properties can be interrogated > and administrator policy can be applied. > > 'Soft Reserved' memory can be anything from too limited and precious to > be part of the general purpose pool (HBM), too slow to host hot kernel > data structures (some PMEM media), or anything in between. However, in > the absence of an explicit policy, the memory should at least be made > usable by default. The current device-dax default hides all > non-general-purpose memory behind a device interface. > > The expectation is that the distribution of users that want the memory > online by default vs device-dedicated-access by default follows the > Pareto principle. A small number of enlightened users may want to do > userspace memory management through a device, but general users just > want the kernel to make the memory available with an option to get more > advanced later. > > Arrange for all device-dax instances not backed by PMEM to default to > attaching to the dax_kmem driver. From there the baseline memory hotplug > policy (CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE / memhp_default_state=) > gates whether the memory comes online or stays offline. Where, if it > stays offline, it can be reliably converted back to device-mode where it > can be partitioned, or fronted by a userspace allocator. > > So, if someone wants device-dax instances for their 'Soft Reserved' > memory: > > 1/ Build a kernel with CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE=n or boot > with memhp_default_state=offline, or roll the dice and hope that the > kernel has not pinned a page in that memory before step 2. > > 2/ Write a udev rule to convert the target dax device(s) from > 'system-ram' mode to 'devdax' mode: > > daxctl reconfigure-device $dax -m devdax -f > > Cc: Michal Hocko <mhocko@xxxxxxxx> > Cc: David Hildenbrand <david@xxxxxxxxxx> > Cc: Dave Hansen <dave.hansen@xxxxxxxxxxxxxxx> > Reviewed-by: Gregory Price <gregory.price@xxxxxxxxxxxx> > Tested-by: Fan Ni <fan.ni@xxxxxxxxxxx> > Link: https://lore.kernel.org/r/167564544513.847146.4645646177864365755.stgit@xxxxxxxxxxxxxxxxxxxxxxxxx > Signed-off-by: Dan Williams <dan.j.williams@xxxxxxxxx> > --- > drivers/dax/Kconfig | 2 +- > drivers/dax/bus.c | 53 ++++++++++++++++++++--------------------------- > drivers/dax/bus.h | 12 +++++++++-- > drivers/dax/device.c | 3 +-- > drivers/dax/hmem/hmem.c | 12 ++++++++++- > drivers/dax/kmem.c | 1 + > 6 files changed, 46 insertions(+), 37 deletions(-) Looks good, Reviewed-by: Vishal Verma <vishal.l.verma@xxxxxxxxx> > > diff --git a/drivers/dax/Kconfig b/drivers/dax/Kconfig > index d13c889c2a64..1163eb62e5f6 100644 > --- a/drivers/dax/Kconfig > +++ b/drivers/dax/Kconfig > @@ -50,7 +50,7 @@ config DEV_DAX_HMEM_DEVICES > def_bool y > > config DEV_DAX_KMEM > - tristate "KMEM DAX: volatile-use of persistent memory" > + tristate "KMEM DAX: map dax-devices as System-RAM" > default DEV_DAX > depends on DEV_DAX > depends on MEMORY_HOTPLUG # for add_memory() and friends > diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c > index 1dad813ee4a6..012d576004e9 100644 > --- a/drivers/dax/bus.c > +++ b/drivers/dax/bus.c > @@ -56,6 +56,25 @@ static int dax_match_id(struct dax_device_driver *dax_drv, struct device *dev) > return match; > } > > +static int dax_match_type(struct dax_device_driver *dax_drv, struct device *dev) > +{ > + enum dax_driver_type type = DAXDRV_DEVICE_TYPE; > + struct dev_dax *dev_dax = to_dev_dax(dev); > + > + if (dev_dax->region->res.flags & IORESOURCE_DAX_KMEM) > + type = DAXDRV_KMEM_TYPE; > + > + if (dax_drv->type == type) > + return 1; > + > + /* default to device mode if dax_kmem is disabled */ > + if (dax_drv->type == DAXDRV_DEVICE_TYPE && > + !IS_ENABLED(CONFIG_DEV_DAX_KMEM)) > + return 1; > + > + return 0; > +} > + > enum id_action { > ID_REMOVE, > ID_ADD, > @@ -216,14 +235,9 @@ static int dax_bus_match(struct device *dev, struct device_driver *drv) > { > struct dax_device_driver *dax_drv = to_dax_drv(drv); > > - /* > - * All but the 'device-dax' driver, which has 'match_always' > - * set, requires an exact id match. > - */ > - if (dax_drv->match_always) > + if (dax_match_id(dax_drv, dev)) > return 1; > - > - return dax_match_id(dax_drv, dev); > + return dax_match_type(dax_drv, dev); > } > > /* > @@ -1413,13 +1427,10 @@ struct dev_dax *devm_create_dev_dax(struct dev_dax_data *data) > } > EXPORT_SYMBOL_GPL(devm_create_dev_dax); > > -static int match_always_count; > - > int __dax_driver_register(struct dax_device_driver *dax_drv, > struct module *module, const char *mod_name) > { > struct device_driver *drv = &dax_drv->drv; > - int rc = 0; > > /* > * dax_bus_probe() calls dax_drv->probe() unconditionally. > @@ -1434,26 +1445,7 @@ int __dax_driver_register(struct dax_device_driver *dax_drv, > drv->mod_name = mod_name; > drv->bus = &dax_bus_type; > > - /* there can only be one default driver */ > - mutex_lock(&dax_bus_lock); > - match_always_count += dax_drv->match_always; > - if (match_always_count > 1) { > - match_always_count--; > - WARN_ON(1); > - rc = -EINVAL; > - } > - mutex_unlock(&dax_bus_lock); > - if (rc) > - return rc; > - > - rc = driver_register(drv); > - if (rc && dax_drv->match_always) { > - mutex_lock(&dax_bus_lock); > - match_always_count -= dax_drv->match_always; > - mutex_unlock(&dax_bus_lock); > - } > - > - return rc; > + return driver_register(drv); > } > EXPORT_SYMBOL_GPL(__dax_driver_register); > > @@ -1463,7 +1455,6 @@ void dax_driver_unregister(struct dax_device_driver *dax_drv) > struct dax_id *dax_id, *_id; > > mutex_lock(&dax_bus_lock); > - match_always_count -= dax_drv->match_always; > list_for_each_entry_safe(dax_id, _id, &dax_drv->ids, list) { > list_del(&dax_id->list); > kfree(dax_id); > diff --git a/drivers/dax/bus.h b/drivers/dax/bus.h > index fbb940293d6d..8cd79ab34292 100644 > --- a/drivers/dax/bus.h > +++ b/drivers/dax/bus.h > @@ -11,7 +11,10 @@ struct dax_device; > struct dax_region; > void dax_region_put(struct dax_region *dax_region); > > -#define IORESOURCE_DAX_STATIC (1UL << 0) > +/* dax bus specific ioresource flags */ > +#define IORESOURCE_DAX_STATIC BIT(0) > +#define IORESOURCE_DAX_KMEM BIT(1) > + > struct dax_region *alloc_dax_region(struct device *parent, int region_id, > struct range *range, int target_node, unsigned int align, > unsigned long flags); > @@ -25,10 +28,15 @@ struct dev_dax_data { > > struct dev_dax *devm_create_dev_dax(struct dev_dax_data *data); > > +enum dax_driver_type { > + DAXDRV_KMEM_TYPE, > + DAXDRV_DEVICE_TYPE, > +}; > + > struct dax_device_driver { > struct device_driver drv; > struct list_head ids; > - int match_always; > + enum dax_driver_type type; > int (*probe)(struct dev_dax *dev); > void (*remove)(struct dev_dax *dev); > }; > diff --git a/drivers/dax/device.c b/drivers/dax/device.c > index 5494d745ced5..ecdff79e31f2 100644 > --- a/drivers/dax/device.c > +++ b/drivers/dax/device.c > @@ -475,8 +475,7 @@ EXPORT_SYMBOL_GPL(dev_dax_probe); > > static struct dax_device_driver device_dax_driver = { > .probe = dev_dax_probe, > - /* all probe actions are unwound by devm, so .remove isn't necessary */ > - .match_always = 1, > + .type = DAXDRV_DEVICE_TYPE, > }; > > static int __init dax_init(void) > diff --git a/drivers/dax/hmem/hmem.c b/drivers/dax/hmem/hmem.c > index e7bdff3132fa..5ec08f9f8a57 100644 > --- a/drivers/dax/hmem/hmem.c > +++ b/drivers/dax/hmem/hmem.c > @@ -11,15 +11,25 @@ module_param_named(region_idle, region_idle, bool, 0644); > > static int dax_hmem_probe(struct platform_device *pdev) > { > + unsigned long flags = IORESOURCE_DAX_KMEM; > struct device *dev = &pdev->dev; > struct dax_region *dax_region; > struct memregion_info *mri; > struct dev_dax_data data; > struct dev_dax *dev_dax; > > + /* > + * @region_idle == true indicates that an administrative agent > + * wants to manipulate the range partitioning before the devices > + * are created, so do not send them to the dax_kmem driver by > + * default. > + */ > + if (region_idle) > + flags = 0; > + > mri = dev->platform_data; > dax_region = alloc_dax_region(dev, pdev->id, &mri->range, > - mri->target_node, PMD_SIZE, 0); > + mri->target_node, PMD_SIZE, flags); > if (!dax_region) > return -ENOMEM; > > diff --git a/drivers/dax/kmem.c b/drivers/dax/kmem.c > index 4852a2dbdb27..918d01d3fbaa 100644 > --- a/drivers/dax/kmem.c > +++ b/drivers/dax/kmem.c > @@ -239,6 +239,7 @@ static void dev_dax_kmem_remove(struct dev_dax *dev_dax) > static struct dax_device_driver device_dax_kmem_driver = { > .probe = dev_dax_kmem_probe, > .remove = dev_dax_kmem_remove, > + .type = DAXDRV_KMEM_TYPE, > }; > > static int __init dax_kmem_init(void) >