From: Alexander Graf <graf@xxxxxxxxxx> Add the core infrastructure to generate Kexec HandOver metadata. Kexec HandOver is a mechanism that allows Linux to preserve state - arbitrary properties as well as memory locations - across kexec. It does so using 2 concepts: 1) State Tree - Every KHO kexec carries a state tree that describes the state of the system. The state tree is represented as hash-tables. Device drivers can add/remove their data into/from the state tree at system runtime. On kexec, the tree is converted to FDT (flattened device tree). 2) Scratch Regions - CMA regions that we allocate in the first kernel. CMA gives us the guarantee that no handover pages land in those regions, because handover pages must be at a static physical memory location. We use these regions as the place to load future kexec images so that they won't collide with any handover data. Signed-off-by: Alexander Graf <graf@xxxxxxxxxx> Co-developed-by: Pratyush Yadav <ptyadav@xxxxxxxxx> Signed-off-by: Pratyush Yadav <ptyadav@xxxxxxxxx> Co-developed-by: Mike Rapoport (Microsoft) <rppt@xxxxxxxxxx> Signed-off-by: Mike Rapoport (Microsoft) <rppt@xxxxxxxxxx> Co-developed-by: Changyuan Lyu <changyuanl@xxxxxxxxxx> Signed-off-by: Changyuan Lyu <changyuanl@xxxxxxxxxx> --- MAINTAINERS | 2 +- include/linux/kexec_handover.h | 109 +++++ kernel/Makefile | 1 + kernel/kexec_handover.c | 865 +++++++++++++++++++++++++++++++++ mm/mm_init.c | 8 + 5 files changed, 984 insertions(+), 1 deletion(-) create mode 100644 include/linux/kexec_handover.h create mode 100644 kernel/kexec_handover.c diff --git a/MAINTAINERS b/MAINTAINERS index 12852355bd66..a000a277ccf7 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -12828,7 +12828,7 @@ F: include/linux/kernfs.h KEXEC L: kexec@xxxxxxxxxxxxxxxxxxx W: http://kernel.org/pub/linux/utils/kernel/kexec/ -F: include/linux/kexec.h +F: include/linux/kexec*.h F: include/uapi/linux/kexec.h F: kernel/kexec* diff --git a/include/linux/kexec_handover.h b/include/linux/kexec_handover.h 
new file mode 100644 index 000000000000..9cd9ad31e2d1 --- /dev/null +++ b/include/linux/kexec_handover.h @@ -0,0 +1,109 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef LINUX_KEXEC_HANDOVER_H +#define LINUX_KEXEC_HANDOVER_H + +#include <linux/types.h> +#include <linux/hashtable.h> +#include <linux/notifier.h> + +struct kho_scratch { + phys_addr_t addr; + phys_addr_t size; +}; + +/* KHO Notifier index */ +enum kho_event { + KEXEC_KHO_FINALIZE = 0, + KEXEC_KHO_UNFREEZE = 1, +}; + +#define KHO_HASHTABLE_BITS 3 +#define KHO_NODE_INIT \ + { \ + .props = HASHTABLE_INIT(KHO_HASHTABLE_BITS), \ + .nodes = HASHTABLE_INIT(KHO_HASHTABLE_BITS), \ + } + +struct kho_node { + struct hlist_node hlist; + + const char *name; + DECLARE_HASHTABLE(props, KHO_HASHTABLE_BITS); + DECLARE_HASHTABLE(nodes, KHO_HASHTABLE_BITS); + + struct list_head list; + bool visited; +}; + +#ifdef CONFIG_KEXEC_HANDOVER +bool kho_is_enabled(void); +void kho_init_node(struct kho_node *node); +int kho_add_node(struct kho_node *parent, const char *name, + struct kho_node *child); +struct kho_node *kho_remove_node(struct kho_node *parent, const char *name); +int kho_add_prop(struct kho_node *node, const char *key, const void *val, + u32 size); +void *kho_remove_prop(struct kho_node *node, const char *key, u32 *size); +int kho_add_string_prop(struct kho_node *node, const char *key, + const char *val); + +int register_kho_notifier(struct notifier_block *nb); +int unregister_kho_notifier(struct notifier_block *nb); + +void kho_memory_init(void); +#else +static inline bool kho_is_enabled(void) +{ + return false; +} + +static inline void kho_init_node(struct kho_node *node) +{ +} + +static inline int kho_add_node(struct kho_node *parent, const char *name, + struct kho_node *child) +{ + return -EOPNOTSUPP; +} + +static inline struct kho_node *kho_remove_node(struct kho_node *parent, + const char *name) +{ + return ERR_PTR(-EOPNOTSUPP); +} + +static inline int kho_add_prop(struct kho_node *node, const char *key, + 
const void *val, u32 size) +{ + return -EOPNOTSUPP; +} + +static inline void *kho_remove_prop(struct kho_node *node, const char *key, + u32 *size) +{ + return ERR_PTR(-EOPNOTSUPP); +} + +static inline int kho_add_string_prop(struct kho_node *node, const char *key, + const char *val) +{ + return -EOPNOTSUPP; +} + +static inline int register_kho_notifier(struct notifier_block *nb) +{ + return -EOPNOTSUPP; +} + +static inline int unregister_kho_notifier(struct notifier_block *nb) +{ + return -EOPNOTSUPP; +} + +static inline void kho_memory_init(void) +{ +} +#endif /* CONFIG_KEXEC_HANDOVER */ + +#endif /* LINUX_KEXEC_HANDOVER_H */ diff --git a/kernel/Makefile b/kernel/Makefile index 87866b037fbe..cef5377c25cd 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -75,6 +75,7 @@ obj-$(CONFIG_CRASH_DUMP) += crash_core.o obj-$(CONFIG_KEXEC) += kexec.o obj-$(CONFIG_KEXEC_FILE) += kexec_file.o obj-$(CONFIG_KEXEC_ELF) += kexec_elf.o +obj-$(CONFIG_KEXEC_HANDOVER) += kexec_handover.o obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o obj-$(CONFIG_COMPAT) += compat.o obj-$(CONFIG_CGROUPS) += cgroup/ diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c new file mode 100644 index 000000000000..df0d9debbb64 --- /dev/null +++ b/kernel/kexec_handover.c @@ -0,0 +1,865 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * kexec_handover.c - kexec handover metadata processing + * Copyright (C) 2023 Alexander Graf <graf@xxxxxxxxxx> + * Copyright (C) 2025 Microsoft Corporation, Mike Rapoport <rppt@xxxxxxxxxx> + * Copyright (C) 2024 Google LLC + */ + +#define pr_fmt(fmt) "KHO: " fmt + +#include <linux/cma.h> +#include <linux/kexec.h> +#include <linux/libfdt.h> +#include <linux/debugfs.h> +#include <linux/memblock.h> +#include <linux/notifier.h> +#include <linux/kexec_handover.h> +#include <linux/page-isolation.h> +#include <linux/rwsem.h> +#include <linux/xxhash.h> +/* + * KHO is tightly coupled with mm init and needs access to some of mm + * internal APIs. 
+ */ +#include "../mm/internal.h" +#include "kexec_internal.h" + +static bool kho_enable __ro_after_init; + +bool kho_is_enabled(void) +{ + return kho_enable; +} +EXPORT_SYMBOL_GPL(kho_is_enabled); + +static int __init kho_parse_enable(char *p) +{ + return kstrtobool(p, &kho_enable); +} +early_param("kho", kho_parse_enable); + +/* + * With KHO enabled, memory can become fragmented because KHO regions may + * be anywhere in physical address space. The scratch regions give us a + * safe zones that we will never see KHO allocations from. This is where we + * can later safely load our new kexec images into and then use the scratch + * area for early allocations that happen before page allocator is + * initialized. + */ +static struct kho_scratch *kho_scratch; +static unsigned int kho_scratch_cnt; + +static struct dentry *debugfs_root; + +struct kho_out { + struct blocking_notifier_head chain_head; + + struct debugfs_blob_wrapper fdt_wrapper; + struct dentry *fdt_file; + struct dentry *dir; + + struct rw_semaphore tree_lock; + struct kho_node root; + + void *fdt; + u64 fdt_max; +}; + +static struct kho_out kho_out = { + .chain_head = BLOCKING_NOTIFIER_INIT(kho_out.chain_head), + .tree_lock = __RWSEM_INITIALIZER(kho_out.tree_lock), + .root = KHO_NODE_INIT, + .fdt_max = 10 * SZ_1M, +}; + +int register_kho_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_register(&kho_out.chain_head, nb); +} +EXPORT_SYMBOL_GPL(register_kho_notifier); + +int unregister_kho_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_unregister(&kho_out.chain_head, nb); +} +EXPORT_SYMBOL_GPL(unregister_kho_notifier); + +/* Helper functions for KHO state tree */ + +struct kho_prop { + struct hlist_node hlist; + + const char *key; + const void *val; + u32 size; +}; + +static unsigned long strhash(const char *s) +{ + return xxhash(s, strlen(s), 1120); +} + +void kho_init_node(struct kho_node *node) +{ + hash_init(node->props); + hash_init(node->nodes); +} 
+EXPORT_SYMBOL_GPL(kho_init_node); + +/** + * kho_add_node - add a child node to a parent node. + * @parent: parent node to add to. + * @name: name of the child node. + * @child: child node to be added to @parent with @name. + * + * If @parent is NULL, @child is added to KHO state tree root node. + * + * @child must be a valid pointer through KHO FDT finalization. + * @name is duplicated and thus can have a short lifetime. + * + * Callers must use their own locking if there are concurrent accesses to + * @parent or @child. + * + * Return: 0 on success, 1 if @child is already in @parent with @name, or + * - -EOPNOTSUPP: KHO is not enabled in the kernel command line, + * - -ENOMEM: failed to duplicate @name, + * - -EBUSY: KHO state tree has been converted to FDT, + * - -EEXIST: another node of the same name has been added to the parent. + */ +int kho_add_node(struct kho_node *parent, const char *name, + struct kho_node *child) +{ + unsigned long name_hash; + int ret = 0; + struct kho_node *node; + char *child_name; + + if (!kho_enable) + return -EOPNOTSUPP; + + if (!parent) + parent = &kho_out.root; + + child_name = kstrdup(name, GFP_KERNEL); + if (!child_name) + return -ENOMEM; + + name_hash = strhash(child_name); + + if (parent == &kho_out.root) + down_write(&kho_out.tree_lock); + else + down_read(&kho_out.tree_lock); + + if (kho_out.fdt) { + ret = -EBUSY; + goto out; + } + + hash_for_each_possible(parent->nodes, node, hlist, name_hash) { + if (!strcmp(node->name, child_name)) { + ret = node == child ? 1 : -EEXIST; + break; + } + } + + if (ret == 0) { + child->name = child_name; + hash_add(parent->nodes, &child->hlist, name_hash); + } + +out: + if (parent == &kho_out.root) + up_write(&kho_out.tree_lock); + else + up_read(&kho_out.tree_lock); + + if (ret) + kfree(child_name); + + return ret; +} +EXPORT_SYMBOL_GPL(kho_add_node); + +/** + * kho_remove_node - remove a child node from a parent node. + * @parent: parent node to look up for. 
+ * @name: name of the child node. + * + * If @parent is NULL, KHO state tree root node is looked up. + * + * Callers must use their own locking if there are concurrent accesses to + * @parent or @child. + * + * Return: the pointer to the child node on success, or an error pointer, + * - -EOPNOTSUPP: KHO is not enabled in the kernel command line, + * - -ENOENT: no node named @name is found. + * - -EBUSY: KHO state tree has been converted to FDT. + */ +struct kho_node *kho_remove_node(struct kho_node *parent, const char *name) +{ + struct kho_node *child, *ret = ERR_PTR(-ENOENT); + unsigned long name_hash; + + if (!kho_enable) + return ERR_PTR(-EOPNOTSUPP); + + if (!parent) + parent = &kho_out.root; + + name_hash = strhash(name); + + if (parent == &kho_out.root) + down_write(&kho_out.tree_lock); + else + down_read(&kho_out.tree_lock); + + if (kho_out.fdt) { + ret = ERR_PTR(-EBUSY); + goto out; + } + + hash_for_each_possible(parent->nodes, child, hlist, name_hash) { + if (!strcmp(child->name, name)) { + ret = child; + break; + } + } + + if (!IS_ERR(ret)) { + hash_del(&ret->hlist); + kfree(ret->name); + ret->name = NULL; + } + +out: + if (parent == &kho_out.root) + up_write(&kho_out.tree_lock); + else + up_read(&kho_out.tree_lock); + + return ret; +} +EXPORT_SYMBOL_GPL(kho_remove_node); + +/** + * kho_add_prop - add a property to a node. + * @node: KHO node to add the property to. + * @key: key of the property. + * @val: pointer to the property value. + * @size: size of the property value in bytes. + * + * @val and @key must be valid pointers through KHO FDT finalization. + * Generally @key is a string literal with static lifetime. + * + * Callers must use their own locking if there are concurrent accesses to @node. + * + * Return: 0 on success, 1 if the value is already added with @key, or + * - -ENOMEM: failed to allocate memory, + * - -EBUSY: KHO state tree has been converted to FDT, + * - -EEXIST: another property of the same key exists. 
+ */ +int kho_add_prop(struct kho_node *node, const char *key, const void *val, + u32 size) +{ + unsigned long key_hash; + int ret = 0; + struct kho_prop *prop, *p; + + key_hash = strhash(key); + prop = kmalloc(sizeof(*prop), GFP_KERNEL); + if (!prop) + return -ENOMEM; + + prop->key = key; + prop->val = val; + prop->size = size; + + down_read(&kho_out.tree_lock); + if (kho_out.fdt) { + ret = -EBUSY; + goto out; + } + + hash_for_each_possible(node->props, p, hlist, key_hash) { + if (!strcmp(p->key, key)) { + ret = (p->val == val && p->size == size) ? 1 : -EEXIST; + break; + } + } + + if (!ret) + hash_add(node->props, &prop->hlist, key_hash); + +out: + up_read(&kho_out.tree_lock); + + if (ret) + kfree(prop); + + return ret; +} +EXPORT_SYMBOL_GPL(kho_add_prop); + +/** + * kho_add_string_prop - add a string property to a node. + * + * See kho_add_prop() for details. + */ +int kho_add_string_prop(struct kho_node *node, const char *key, const char *val) +{ + return kho_add_prop(node, key, val, strlen(val) + 1); +} +EXPORT_SYMBOL_GPL(kho_add_string_prop); + +/** + * kho_remove_prop - remove a property from a node. + * @node: KHO node to remove the property from. + * @key: key of the property. + * @size: if non-NULL, the property size is stored in it on success. + * + * Callers must use their own locking if there are concurrent accesses to @node. + * + * Return: the pointer to the property value, or + * - -EBUSY: KHO state tree has been converted to FDT, + * - -ENOENT: no property with @key is found. 
+ */ +void *kho_remove_prop(struct kho_node *node, const char *key, u32 *size) +{ + struct kho_prop *p, *prop = NULL; + unsigned long key_hash; + void *ret = ERR_PTR(-ENOENT); + + key_hash = strhash(key); + + down_read(&kho_out.tree_lock); + + if (kho_out.fdt) { + ret = ERR_PTR(-EBUSY); + goto out; + } + + hash_for_each_possible(node->props, p, hlist, key_hash) { + if (!strcmp(p->key, key)) { + prop = p; + break; + } + } + + if (prop) { + ret = (void *)prop->val; + if (size) + *size = prop->size; + hash_del(&prop->hlist); + kfree(prop); + } + +out: + up_read(&kho_out.tree_lock); + + return ret; +} +EXPORT_SYMBOL_GPL(kho_remove_prop); + +static int kho_out_update_debugfs_fdt(void) +{ + int err = 0; + + if (kho_out.fdt) { + kho_out.fdt_wrapper.data = kho_out.fdt; + kho_out.fdt_wrapper.size = fdt_totalsize(kho_out.fdt); + kho_out.fdt_file = debugfs_create_blob("fdt", 0400, kho_out.dir, + &kho_out.fdt_wrapper); + if (IS_ERR(kho_out.fdt_file)) + err = -ENOENT; + } else { + debugfs_remove(kho_out.fdt_file); + } + + return err; +} + +static int kho_unfreeze(void) +{ + int err; + void *fdt; + + down_write(&kho_out.tree_lock); + fdt = kho_out.fdt; + kho_out.fdt = NULL; + up_write(&kho_out.tree_lock); + + if (fdt) + kvfree(fdt); + + err = blocking_notifier_call_chain(&kho_out.chain_head, + KEXEC_KHO_UNFREEZE, NULL); + err = notifier_to_errno(err); + + return notifier_to_errno(err); +} + +static int kho_flatten_tree(void *fdt) +{ + int iter, err = 0; + struct kho_node *node, *sub_node; + struct list_head *ele; + struct kho_prop *prop; + LIST_HEAD(stack); + + kho_out.root.visited = false; + list_add(&kho_out.root.list, &stack); + + for (ele = stack.next; !list_is_head(ele, &stack); ele = stack.next) { + node = list_entry(ele, struct kho_node, list); + + if (node->visited) { + err = fdt_end_node(fdt); + if (err) + return err; + list_del_init(ele); + continue; + } + + err = fdt_begin_node(fdt, node->name); + if (err) + return err; + + hash_for_each(node->props, iter, prop, 
hlist) { + err = fdt_property(fdt, prop->key, prop->val, + prop->size); + if (err) + return err; + } + + hash_for_each(node->nodes, iter, sub_node, hlist) { + sub_node->visited = false; + list_add(&sub_node->list, &stack); + } + + node->visited = true; + } + + return 0; +} + +static int kho_convert_tree(void *buffer, int size) +{ + void *fdt = buffer; + int err = 0; + + err = fdt_create(fdt, size); + if (err) + goto out; + + err = fdt_finish_reservemap(fdt); + if (err) + goto out; + + err = kho_flatten_tree(fdt); + if (err) + goto out; + + err = fdt_finish(fdt); + if (err) + goto out; + + err = fdt_check_header(fdt); + if (err) + goto out; + +out: + if (err) { + pr_err("failed to flatten state tree: %d\n", err); + return -EINVAL; + } + return 0; +} + +static int kho_finalize(void) +{ + int err = 0; + void *fdt; + + fdt = kvmalloc(kho_out.fdt_max, GFP_KERNEL); + if (!fdt) + return -ENOMEM; + + err = blocking_notifier_call_chain(&kho_out.chain_head, + KEXEC_KHO_FINALIZE, NULL); + err = notifier_to_errno(err); + if (err) + goto unfreeze; + + down_write(&kho_out.tree_lock); + kho_out.fdt = fdt; + up_write(&kho_out.tree_lock); + + err = kho_convert_tree(fdt, kho_out.fdt_max); + +unfreeze: + if (err) { + int abort_err; + + pr_err("Failed to convert KHO state tree: %d\n", err); + + abort_err = kho_unfreeze(); + if (abort_err) + pr_err("Failed to abort KHO state tree: %d\n", + abort_err); + } + + return err; +} + +/* Handling for debug/kho/out */ +static int kho_out_finalize_get(void *data, u64 *val) +{ + *val = !!kho_out.fdt; + + return 0; +} + +static int kho_out_finalize_set(void *data, u64 _val) +{ + int ret = 0; + bool val = !!_val; + + if (!kexec_trylock()) + return -EBUSY; + + if (val == !!kho_out.fdt) { + if (kho_out.fdt) + ret = -EEXIST; + else + ret = -ENOENT; + goto unlock; + } + + if (val) + ret = kho_finalize(); + else + ret = kho_unfreeze(); + + if (ret) + goto unlock; + + ret = kho_out_update_debugfs_fdt(); + +unlock: + kexec_unlock(); + return ret; +} + 
+DEFINE_DEBUGFS_ATTRIBUTE(fops_kho_out_finalize, kho_out_finalize_get,
+			 kho_out_finalize_set, "%llu\n");
+
+static int kho_out_fdt_max_get(void *data, u64 *val)
+{
+	*val = kho_out.fdt_max;
+
+	return 0;
+}
+
+static int kho_out_fdt_max_set(void *data, u64 val)
+{
+	int ret = 0;
+
+	if (!kexec_trylock()) {
+		ret = -EBUSY;
+		return ret;
+	}
+
+	/* FDT already exists, it's too late to change fdt_max */
+	if (kho_out.fdt) {
+		ret = -EBUSY;
+		goto unlock;
+	}
+
+	kho_out.fdt_max = val;
+
+unlock:
+	kexec_unlock();
+	return ret;
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(fops_kho_out_fdt_max, kho_out_fdt_max_get,
+			 kho_out_fdt_max_set, "%llu\n");
+
+static int scratch_phys_show(struct seq_file *m, void *v)
+{
+	for (int i = 0; i < kho_scratch_cnt; i++)
+		seq_printf(m, "0x%llx\n", kho_scratch[i].addr);
+
+	return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(scratch_phys);
+
+static int scratch_len_show(struct seq_file *m, void *v)
+{
+	for (int i = 0; i < kho_scratch_cnt; i++)
+		seq_printf(m, "0x%llx\n", kho_scratch[i].size);
+
+	return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(scratch_len);
+
+static __init int kho_out_debugfs_init(void)
+{
+	struct dentry *dir, *f;
+
+	dir = debugfs_create_dir("out", debugfs_root);
+	if (IS_ERR(dir))
+		return -ENOMEM;
+
+	f = debugfs_create_file("scratch_phys", 0400, dir, NULL,
+				&scratch_phys_fops);
+	if (IS_ERR(f))
+		goto err_rmdir;
+
+	f = debugfs_create_file("scratch_len", 0400, dir, NULL,
+				&scratch_len_fops);
+	if (IS_ERR(f))
+		goto err_rmdir;
+
+	f = debugfs_create_file("fdt_max", 0600, dir, NULL,
+				&fops_kho_out_fdt_max);
+	if (IS_ERR(f))
+		goto err_rmdir;
+
+	f = debugfs_create_file("finalize", 0600, dir, NULL,
+				&fops_kho_out_finalize);
+	if (IS_ERR(f))
+		goto err_rmdir;
+
+	kho_out.dir = dir;
+	return 0;
+
+err_rmdir:
+	debugfs_remove_recursive(dir);
+	return -ENOENT;
+}
+
+static __init int kho_init(void)
+{
+	int err;
+
+	if (!kho_enable)
+		return 0;
+
+	kho_out.root.name = "";
+	err = kho_add_string_prop(&kho_out.root, "compatible", "kho-v1");
+	if (err)
+		goto err_free_scratch;
+
+	debugfs_root = debugfs_create_dir("kho", NULL);
+	if (IS_ERR(debugfs_root)) {
+		err = -ENOENT;
+		goto err_free_scratch;
+	}
+
+	err = kho_out_debugfs_init();
+	if (err)
+		goto err_free_scratch;
+
+	for (int i = 0; i < kho_scratch_cnt; i++) {
+		unsigned long base_pfn = PHYS_PFN(kho_scratch[i].addr);
+		unsigned long count = kho_scratch[i].size >> PAGE_SHIFT;
+		unsigned long pfn;
+
+		for (pfn = base_pfn; pfn < base_pfn + count;
+		     pfn += pageblock_nr_pages)
+			init_cma_reserved_pageblock(pfn_to_page(pfn));
+	}
+
+	return 0;
+
+err_free_scratch:
+	for (int i = 0; i < kho_scratch_cnt; i++) {
+		void *start = __va(kho_scratch[i].addr);
+		void *end = start + kho_scratch[i].size;
+
+		free_reserved_area(start, end, -1, "");
+	}
+	kho_enable = false;
+	return err;
+}
+late_initcall(kho_init);
+
+/*
+ * The scratch areas are scaled by default as percent of memory allocated from
+ * memblock. A user can override the scale with command line parameter:
+ *
+ * kho_scratch=N%
+ *
+ * It is also possible to explicitly define size for a lowmem, a global and
+ * per-node scratch areas:
+ *
+ * kho_scratch=l[KMG],n[KMG],m[KMG]
+ *
+ * The explicit size definition takes precedence over scale definition.
+ */
+static unsigned int scratch_scale __initdata = 200;
+static phys_addr_t scratch_size_global __initdata;
+static phys_addr_t scratch_size_pernode __initdata;
+static phys_addr_t scratch_size_lowmem __initdata;
+
+static int __init kho_parse_scratch_size(char *p)
+{
+	unsigned long size, size_pernode, size_global;
+	char *endptr, *oldp = p;
+
+	if (!p)
+		return -EINVAL;
+
+	size = simple_strtoul(p, &endptr, 0);
+	if (*endptr == '%') {
+		scratch_scale = size;
+		pr_notice("scratch scale is %u percent\n", scratch_scale);
+	} else {
+		size = memparse(p, &p);
+		if (!size || p == oldp)
+			return -EINVAL;
+
+		if (*p != ',')
+			return -EINVAL;
+
+		oldp = p;
+		size_global = memparse(p + 1, &p);
+		if (!size_global || p == oldp)
+			return -EINVAL;
+
+		if (*p != ',')
+			return -EINVAL;
+
+		size_pernode = memparse(p + 1, &p);
+		if (!size_pernode)
+			return -EINVAL;
+
+		scratch_size_lowmem = size;
+		scratch_size_global = size_global;
+		scratch_size_pernode = size_pernode;
+		scratch_scale = 0;
+
+		pr_notice("scratch areas: lowmem: %lluMB global: %lluMB pernode: %lluMB\n",
+			  (u64)(scratch_size_lowmem >> 20),
+			  (u64)(scratch_size_global >> 20),
+			  (u64)(scratch_size_pernode >> 20));
+	}
+
+	return 0;
+}
+early_param("kho_scratch", kho_parse_scratch_size);
+
+static void __init scratch_size_update(void)
+{
+	phys_addr_t size;
+
+	if (!scratch_scale)
+		return;
+
+	size = memblock_reserved_kern_size(ARCH_LOW_ADDRESS_LIMIT,
+					   NUMA_NO_NODE);
+	size = size * scratch_scale / 100;
+	scratch_size_lowmem = round_up(size, CMA_MIN_ALIGNMENT_BYTES);
+
+	size = memblock_reserved_kern_size(MEMBLOCK_ALLOC_ANYWHERE,
+					   NUMA_NO_NODE);
+	size = size * scratch_scale / 100 - scratch_size_lowmem;
+	scratch_size_global = round_up(size, CMA_MIN_ALIGNMENT_BYTES);
+}
+
+static phys_addr_t __init scratch_size_node(int nid)
+{
+	phys_addr_t size;
+
+	if (scratch_scale) {
+		size = memblock_reserved_kern_size(MEMBLOCK_ALLOC_ANYWHERE,
+						   nid);
+		size = size * scratch_scale / 100;
+	} else {
+		size = scratch_size_pernode;
+	}
+
+	return round_up(size, CMA_MIN_ALIGNMENT_BYTES);
+}
+
+/**
+ * kho_reserve_scratch - Reserve a contiguous chunk of memory for kexec
+ *
+ * With KHO we can preserve arbitrary pages in the system. To ensure we still
+ * have a large contiguous region of memory when we search the physical address
+ * space for target memory, let's make sure we always have a large CMA region
+ * active. This CMA region will only be used for movable pages which are not a
+ * problem for us during KHO because we can just move them somewhere else.
+ */
+static void __init kho_reserve_scratch(void)
+{
+	phys_addr_t addr, size;
+	int nid, i = 0;
+
+	if (!kho_enable)
+		return;
+
+	scratch_size_update();
+
+	/* FIXME: deal with node hot-plug/remove */
+	kho_scratch_cnt = num_online_nodes() + 2;
+	size = kho_scratch_cnt * sizeof(*kho_scratch);
+	kho_scratch = memblock_alloc(size, PAGE_SIZE);
+	if (!kho_scratch)
+		goto err_disable_kho;
+
+	/*
+	 * reserve scratch area in low memory for lowmem allocations in the
+	 * next kernel
+	 */
+	size = scratch_size_lowmem;
+	addr = memblock_phys_alloc_range(size, CMA_MIN_ALIGNMENT_BYTES, 0,
+					 ARCH_LOW_ADDRESS_LIMIT);
+	if (!addr)
+		goto err_free_scratch_desc;
+
+	kho_scratch[i].addr = addr;
+	kho_scratch[i].size = size;
+	i++;
+
+	/* reserve large contiguous area for allocations without nid */
+	size = scratch_size_global;
+	addr = memblock_phys_alloc(size, CMA_MIN_ALIGNMENT_BYTES);
+	if (!addr)
+		goto err_free_scratch_areas;
+
+	kho_scratch[i].addr = addr;
+	kho_scratch[i].size = size;
+	i++;
+
+	for_each_online_node(nid) {
+		size = scratch_size_node(nid);
+		addr = memblock_alloc_range_nid(size, CMA_MIN_ALIGNMENT_BYTES,
+						0, MEMBLOCK_ALLOC_ACCESSIBLE,
+						nid, true);
+		if (!addr)
+			goto err_free_scratch_areas;
+
+		kho_scratch[i].addr = addr;
+		kho_scratch[i].size = size;
+		i++;
+	}
+
+	return;
+
+err_free_scratch_areas:
+	for (i--; i >= 0; i--)
+		memblock_phys_free(kho_scratch[i].addr, kho_scratch[i].size);
+err_free_scratch_desc:
+	memblock_free(kho_scratch, kho_scratch_cnt * sizeof(*kho_scratch));
+err_disable_kho:
+	kho_enable = false;
+}
+
+void __init kho_memory_init(void)
+{
+	kho_reserve_scratch();
+}
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 04441c258b05..757659b7a26b 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -30,6 +30,7 @@
 #include <linux/crash_dump.h>
 #include <linux/execmem.h>
 #include <linux/vmstat.h>
+#include <linux/kexec_handover.h>
 #include "internal.h"
 #include "slab.h"
 #include "shuffle.h"
@@ -2661,6 +2662,13 @@ void __init mm_core_init(void)
 	report_meminit();
 	kmsan_init_shadow();
 	stack_depot_early_init();
+
+	/*
+	 * KHO memory setup must happen while memblock is still active, but
+	 * as close as possible to buddy initialization
+	 */
+	kho_memory_init();
+
 	mem_init();
 	kmem_cache_init();
 	/*
-- 
2.48.1.711.g2feabab25a-goog