On 03.08.20 07:03, Dan Williams wrote: > Several related issues around this unneeded attribute: > > - The dax_kmem_res property allows the kmem driver to stash the adjusted > resource range that was used for the hotplug operation, but that can be > recalculated from the original base range. > > - kmem is using an open coded release_resource() + kfree() when an > idiomatic release_mem_region() is sufficient. > > - The driver managed resource need only manage the busy flag. Other flags > are of no concern to the kmem driver. In fact if kmem inherits some > memory range that add_memory_driver_managed() rejects that is a > memory-hotplug-core policy that the driver is in no position to > override. > > - The implementation trusts that failed remove_memory() results in the > entire resource range remaining pinned busy. The driver need not make > that layering violation assumption and just maintain the busy state in > its local resource. > > - The "Hot-remove not yet implemented." comment is stale since hotremove > support is now included. I think some of these changes could have been nicely split out to simplify reviewing. E.g., comment update, release_mem_region(), &= ~IORESOURCE_BUSY ... [...] > + > int dev_dax_kmem_probe(struct device *dev) > { > struct dev_dax *dev_dax = to_dev_dax(dev); > - struct range *range = &dev_dax->range; > - resource_size_t kmem_start; > - resource_size_t kmem_size; > - resource_size_t kmem_end; > - struct resource *new_res; > - const char *new_res_name; > - int numa_node; > + struct range range = dax_kmem_range(dev_dax); > + int numa_node = dev_dax->target_node; > + struct resource *res; > + char *res_name; > int rc; > > /* > @@ -37,109 +45,94 @@ int dev_dax_kmem_probe(struct device *dev) > * could be mixed in a node with faster memory, causing > * unavoidable performance issues. > */ > - numa_node = dev_dax->target_node; > if (numa_node < 0) { > dev_warn(dev, "rejecting DAX region with invalid node: %d\n", > numa_node); > return -EINVAL; > } > > - /* Hotplug starting at the beginning of the next block: */ > - kmem_start = ALIGN(range->start, memory_block_size_bytes()); > - > - kmem_size = range_len(range); > - /* Adjust the size down to compensate for moving up kmem_start: */ > - kmem_size -= kmem_start - range->start; > - /* Align the size down to cover only complete blocks: */ > - kmem_size &= ~(memory_block_size_bytes() - 1); > - kmem_end = kmem_start + kmem_size; > - > - new_res_name = kstrdup(dev_name(dev), GFP_KERNEL); > - if (!new_res_name) > + res_name = kstrdup(dev_name(dev), GFP_KERNEL); > + if (!res_name) > return -ENOMEM; > > - /* Region is permanently reserved if hotremove fails. */ > - new_res = request_mem_region(kmem_start, kmem_size, new_res_name); > - if (!new_res) { > - dev_warn(dev, "could not reserve region [%pa-%pa]\n", > - &kmem_start, &kmem_end); > - kfree(new_res_name); > + res = request_mem_region(range.start, range_len(&range), res_name); I think our range could be empty after aligning. I assume request_mem_region() would check that, but maybe we could report a better error/warning in that case. > + if (!res) { > + dev_warn(dev, "could not reserve region [%#llx-%#llx]\n", > + range.start, range.end); > + kfree(res_name); > return -EBUSY; > } > > /* > - * Set flags appropriate for System RAM. Leave ..._BUSY clear > - * so that add_memory() can add a child resource. Do not > - * inherit flags from the parent since it may set new flags > - * unknown to us that will break add_memory() below. > + * Temporarily clear busy to allow add_memory_driver_managed() > + * to claim it. > */ > - new_res->flags = IORESOURCE_SYSTEM_RAM; > + res->flags &= ~IORESOURCE_BUSY; Right, same approach is taken by virtio-mem. > > /* > * Ensure that future kexec'd kernels will not treat this as RAM > * automatically. > */ > - rc = add_memory_driver_managed(numa_node, new_res->start, > - resource_size(new_res), kmem_name); > + rc = add_memory_driver_managed(numa_node, res->start, > + resource_size(res), kmem_name); > + > + res->flags |= IORESOURCE_BUSY; Hm, I don't think that's correct. Any specific reason why to mark the not-added, unaligned parts BUSY? E.g., walk_system_ram_range() could suddenly stumble over it - and e.g., similarly kexec code when trying to find memory for placing kexec images. I think we should leave this !BUSY, just as it is right now. > if (rc) { > - release_resource(new_res); > - kfree(new_res); > - kfree(new_res_name); > + release_mem_region(range.start, range_len(&range)); > + kfree(res_name); > return rc; > } > - dev_dax->dax_kmem_res = new_res; > + > + dev_set_drvdata(dev, res_name); > > return 0; > } > > #ifdef CONFIG_MEMORY_HOTREMOVE > -static int dev_dax_kmem_remove(struct device *dev) > +static void dax_kmem_release(struct dev_dax *dev_dax) > { > - struct dev_dax *dev_dax = to_dev_dax(dev); > - struct resource *res = dev_dax->dax_kmem_res; > - resource_size_t kmem_start = res->start; > - resource_size_t kmem_size = resource_size(res); > - const char *res_name = res->name; > int rc; > + struct device *dev = &dev_dax->dev; > + const char *res_name = dev_get_drvdata(dev); > + struct range range = dax_kmem_range(dev_dax); > > /* > * We have one shot for removing memory, if some memory blocks were not > * offline prior to calling this function remove_memory() will fail, and > * there is no way to hotremove this memory until reboot because device > - * unbind will succeed even if we return failure. > + * unbind will proceed regardless of the remove_memory result. > */ > - rc = remove_memory(dev_dax->target_node, kmem_start, kmem_size); > - if (rc) { > - any_hotremove_failed = true; > - dev_err(dev, > - "DAX region %pR cannot be hotremoved until the next reboot\n", > - res); > - return rc; > + rc = remove_memory(dev_dax->target_node, range.start, range_len(&range)); > + if (rc == 0) { if (!rc) ? > + release_mem_region(range.start, range_len(&range)); remove_memory() does a release_mem_region_adjustable(). Don't you actually want to release the *unaligned* region you requested? > + dev_set_drvdata(dev, NULL); > + kfree(res_name); > + return; > } Not sure if inverting the error handling improved the code / review here. > > - /* Release and free dax resources */ > - release_resource(res); > - kfree(res); > - kfree(res_name); > - dev_dax->dax_kmem_res = NULL; > - > - return 0; > + any_hotremove_failed = true; > + dev_err(dev, "%#llx-%#llx cannot be hotremoved until the next reboot\n", > + range.start, range.end); > } > #else > -static int dev_dax_kmem_remove(struct device *dev) > +static void dax_kmem_release(struct dev_dax *dev_dax) > { > /* > - * Without hotremove purposely leak the request_mem_region() for the > - * device-dax range and return '0' to ->remove() attempts. The removal > - * of the device from the driver always succeeds, but the region is > - * permanently pinned as reserved by the unreleased > - * request_mem_region(). > + * Without hotremove purposely leak the request_mem_region() for > + * the device-dax range attempts. The removal of the device from > + * the driver always succeeds, but the region is permanently > + * pinned as reserved by the unreleased request_mem_region(). > */ > any_hotremove_failed = true; > - return 0; > } > #endif /* CONFIG_MEMORY_HOTREMOVE */ > > +static int dev_dax_kmem_remove(struct device *dev) > +{ > + dax_kmem_release(to_dev_dax(dev)); > + return 0; > +} > + > static struct dax_device_driver device_dax_kmem_driver = { > .drv = { > .probe = dev_dax_kmem_probe, > Maybe split some of these changes out. Would at least help me to review ;) -- Thanks, David / dhildenb