snip . . .
+
+config PROTECTABLE_MEMORY
+	bool
+	depends on MMU
Curious: would you also want this to depend on "SECURITY", since it is advertised as a complement to __ro_after_init per the file header comments? I'm assuming ro_after_init would be disabled if the SECURITY Kconfig option is *NOT* selected.
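Roughly what I'm picturing (untested sketch; SECURITY is just my guess at the right symbol to depend on):

config PROTECTABLE_MEMORY
	bool
	depends on MMU
	depends on SECURITY
	depends on ARCH_HAS_SET_MEMORY
	select GENERIC_ALLOCATOR
	default y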
+	depends on ARCH_HAS_SET_MEMORY
+	select GENERIC_ALLOCATOR
+	default y
diff --git a/mm/Makefile b/mm/Makefile
index e669f02c5a54..959fdbdac118 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -65,6 +65,7 @@ obj-$(CONFIG_SPARSEMEM) += sparse.o
 obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
 obj-$(CONFIG_SLOB) += slob.o
 obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
+obj-$(CONFIG_PROTECTABLE_MEMORY) += pmalloc.o
 obj-$(CONFIG_KSM) += ksm.o
 obj-$(CONFIG_PAGE_POISONING) += page_poison.o
 obj-$(CONFIG_SLAB) += slab.o
diff --git a/mm/pmalloc.c b/mm/pmalloc.c
new file mode 100644
index 000000000000..acdec0fbdde6
--- /dev/null
+++ b/mm/pmalloc.c
@@ -0,0 +1,468 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * pmalloc.c: Protectable Memory Allocator
+ *
+ * (C) Copyright 2017 Huawei Technologies Co. Ltd.
+ * Author: Igor Stoppa<igor.stoppa@xxxxxxxxxx>
+ */
+
+#include <linux/printk.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+#include <linux/genalloc.h>
+#include <linux/kernel.h>
+#include <linux/log2.h>
+#include <linux/slab.h>
+#include <linux/device.h>
+#include <linux/atomic.h>
+#include <linux/rculist.h>
+#include <linux/set_memory.h>
+#include <linux/bug.h>
+#include <asm/cacheflush.h>
+#include <asm/page.h>
+
+#include <linux/pmalloc.h>
+/*
+ * pmalloc_data contains the data specific to a pmalloc pool,
+ * in a format compatible with the design of gen_alloc.
+ * Some of the fields are used for exposing the corresponding parameter
+ * to userspace, through sysfs.
+ */
+struct pmalloc_data {
+	struct gen_pool *pool;  /* Link back to the associated pool. */
+	bool protected;     /* Status of the pool: RO or RW. */
Nitpick: you could probably get slightly better struct packing if "bool protected" were moved to be the last member of this structure.
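Something like this reordering is what I mean (sketch only, members and comments copied from the patch):

struct pmalloc_data {
	struct gen_pool *pool;  /* Link back to the associated pool. */
	struct kobj_attribute attr_protected; /* Sysfs attribute. */
	struct kobj_attribute attr_avail;     /* Sysfs attribute. */
	struct kobj_attribute attr_size;      /* Sysfs attribute. */
	struct kobj_attribute attr_chunks;    /* Sysfs attribute. */
	struct kobject *pool_kobject;
	struct list_head node;  /* list of pools */
	bool protected;         /* Status of the pool: RO or RW; last, to avoid an interior hole. */
};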
+	struct kobj_attribute attr_protected; /* Sysfs attribute. */
+	struct kobj_attribute attr_avail;     /* Sysfs attribute. */
+	struct kobj_attribute attr_size;      /* Sysfs attribute. */
+	struct kobj_attribute attr_chunks;    /* Sysfs attribute. */
+	struct kobject *pool_kobject;
+	struct list_head node; /* list of pools */
+};
+
+static LIST_HEAD(pmalloc_final_list);
+static LIST_HEAD(pmalloc_tmp_list);
+static struct list_head *pmalloc_list = &pmalloc_tmp_list;
+static DEFINE_MUTEX(pmalloc_mutex);
+static struct kobject *pmalloc_kobject;
+
+static ssize_t pmalloc_pool_show_protected(struct kobject *dev,
+					   struct kobj_attribute *attr,
+					   char *buf)
+{
+	struct pmalloc_data *data;
+
+	data = container_of(attr, struct pmalloc_data, attr_protected);
+	if (data->protected)
+		return sprintf(buf, "protected\n");
+	else
+		return sprintf(buf, "unprotected\n");
+}
+
+static ssize_t pmalloc_pool_show_avail(struct kobject *dev,
+				       struct kobj_attribute *attr,
+				       char *buf)
+{
+	struct pmalloc_data *data;
+
+	data = container_of(attr, struct pmalloc_data, attr_avail);
+	return sprintf(buf, "%lu\n",
+		       (unsigned long)gen_pool_avail(data->pool));
+}
+
+static ssize_t pmalloc_pool_show_size(struct kobject *dev,
+				      struct kobj_attribute *attr,
+				      char *buf)
+{
+	struct pmalloc_data *data;
+
+	data = container_of(attr, struct pmalloc_data, attr_size);
+	return sprintf(buf, "%lu\n",
+		       (unsigned long)gen_pool_size(data->pool));
+}
Curious, will this show the size in bytes?
+
+static void pool_chunk_number(struct gen_pool *pool,
+			      struct gen_pool_chunk *chunk, void *data)
+{
+	unsigned long *counter = data;
+
+	(*counter)++;
+}
+
+static ssize_t pmalloc_pool_show_chunks(struct kobject *dev,
+					struct kobj_attribute *attr,
+					char *buf)
+{
+	struct pmalloc_data *data;
+	unsigned long chunks_num = 0;
+
+	data = container_of(attr, struct pmalloc_data, attr_chunks);
+	gen_pool_for_each_chunk(data->pool, pool_chunk_number, &chunks_num);
+	return sprintf(buf, "%lu\n", chunks_num);
+}
+
+/* Exposes the pool and its attributes through sysfs. */
+static struct kobject *pmalloc_connect(struct pmalloc_data *data)
+{
+	const struct attribute *attrs[] = {
+		&data->attr_protected.attr,
+		&data->attr_avail.attr,
+		&data->attr_size.attr,
+		&data->attr_chunks.attr,
+		NULL
+	};
+	struct kobject *kobj;
+
+	kobj = kobject_create_and_add(data->pool->name, pmalloc_kobject);
+	if (unlikely(!kobj))
+		return NULL;
+
+	if (unlikely(sysfs_create_files(kobj, attrs) < 0)) {
+		kobject_put(kobj);
+		kobj = NULL;
+	}
+	return kobj;
+}
+
+/* Removes the pool and its attributes from sysfs. */
+static void pmalloc_disconnect(struct pmalloc_data *data,
+			       struct kobject *kobj)
+{
+	const struct attribute *attrs[] = {
+		&data->attr_protected.attr,
+		&data->attr_avail.attr,
+		&data->attr_size.attr,
+		&data->attr_chunks.attr,
+		NULL
+	};
+
+	sysfs_remove_files(kobj, attrs);
+	kobject_put(kobj);
+}
+
+/* Declares an attribute of the pool. */
+#define pmalloc_attr_init(data, attr_name) \
+do { \
+	sysfs_attr_init(&data->attr_##attr_name.attr); \
+	data->attr_##attr_name.attr.name = #attr_name; \
+	data->attr_##attr_name.attr.mode = VERIFY_OCTAL_PERMISSIONS(0400); \
+	data->attr_##attr_name.show = pmalloc_pool_show_##attr_name; \
Why are these ##attr helpers being used instead of the #define macros already in the kernel (DEVICE_ATTR(), DEVICE_ATTR_RO(), etc., found in include/linux/device.h)? Those macros are much easier to read and use.
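Since these are kobj_attributes rather than device attributes, the closer equivalent is probably the __ATTR() helper from include/linux/sysfs.h. A rough sketch of what one attribute could look like (names here are made up; note a file-scope attribute loses the container_of() route back to the per-pool pmalloc_data, so real state would have to be reachable some other way):

#include <linux/kernel.h>
#include <linux/kobject.h>
#include <linux/sysfs.h>

/* Hypothetical stand-alone attribute, just to show the __ATTR() shape. */
static ssize_t protected_show(struct kobject *kobj,
			      struct kobj_attribute *attr, char *buf)
{
	/*
	 * Without container_of() back to a pmalloc_data, a fixed string
	 * keeps this sketch self-contained.
	 */
	return sprintf(buf, "unprotected\n");
}

static struct kobj_attribute protected_attr =
	__ATTR(protected, 0400, protected_show, NULL);

That said, if the per-pool container_of() trick is the reason for rolling your own macro, it would be worth a comment saying so.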
+} while (0)
+
+struct gen_pool *pmalloc_create_pool(const char *name, int min_alloc_order)
+{
+	struct gen_pool *pool;
+	const char *pool_name;
+	struct pmalloc_data *data;
+
+	if (unlikely(!name)) {
+		WARN(true, "unnamed pool");
+		return NULL;
+	}
+
+	if (min_alloc_order < 0)
+		min_alloc_order = ilog2(sizeof(unsigned long));
+
+	pool = gen_pool_create(min_alloc_order, NUMA_NO_NODE);
+	if (unlikely(!pool))
+		return NULL;
+
+	mutex_lock(&pmalloc_mutex);
+	list_for_each_entry(data, pmalloc_list, node)
+		if (!strcmp(name, data->pool->name))
+			goto same_name_err;
+
+	pool_name = kstrdup(name, GFP_KERNEL);
+	if (unlikely(!pool_name))
+		goto name_alloc_err;
+
+	data = kzalloc(sizeof(struct pmalloc_data), GFP_KERNEL);
+	if (unlikely(!data))
+		goto data_alloc_err;
+
+	data->protected = false;
+	data->pool = pool;
+	pmalloc_attr_init(data, protected);
+	pmalloc_attr_init(data, avail);
+	pmalloc_attr_init(data, size);
+	pmalloc_attr_init(data, chunks);
+	pool->data = data;
+	pool->name = pool_name;
+
+	list_add(&data->node, pmalloc_list);
+	if (pmalloc_list == &pmalloc_final_list)
+		data->pool_kobject = pmalloc_connect(data);
+	mutex_unlock(&pmalloc_mutex);
+	return pool;
+
+data_alloc_err:
+	kfree(pool_name);
+name_alloc_err:
+same_name_err:
+	mutex_unlock(&pmalloc_mutex);
+	gen_pool_destroy(pool);
+	return NULL;
+}
+
+static inline bool chunk_tagging(void *chunk, bool tag)
+{
+	struct vm_struct *area;
+	struct page *page;
+
+	if (!is_vmalloc_addr(chunk))
+		return false;
+
+	page = vmalloc_to_page(chunk);
+	if (unlikely(!page))
+		return false;
+
+	area = page->area;
+	if (tag)
+		area->flags |= VM_PMALLOC;
+	else
+		area->flags &= ~VM_PMALLOC;
+	return true;
+}
+
+
+static inline bool tag_chunk(void *chunk)
+{
+	return chunk_tagging(chunk, true);
+}
+
+
+static inline bool untag_chunk(void *chunk)
+{
+	return chunk_tagging(chunk, false);
+}
+
+enum {
+	INVALID_PMALLOC_OBJECT = -1,
+	NOT_PMALLOC_OBJECT = 0,
+	VALID_PMALLOC_OBJECT = 1,
+};
+
+int is_pmalloc_object(const void *ptr, const unsigned long n)
+{
+	struct vm_struct *area;
+	struct page *page;
+	unsigned long area_start;
+	unsigned long area_end;
+	unsigned long object_start;
+	unsigned long object_end;
+
+
+	/*
+	 * is_pmalloc_object gets called pretty late, so chances are high
+	 * that the object is indeed of vmalloc type
+	 */
+	if (unlikely(!is_vmalloc_addr(ptr)))
+		return NOT_PMALLOC_OBJECT;
+
+	page = vmalloc_to_page(ptr);
+	if (unlikely(!page))
+		return NOT_PMALLOC_OBJECT;
+
+	area = page->area;
+
+	if (likely(!(area->flags & VM_PMALLOC)))
+		return NOT_PMALLOC_OBJECT;
+
+	area_start = (unsigned long)area->addr;
+	area_end = area_start + area->nr_pages * PAGE_SIZE - 1;
+	object_start = (unsigned long)ptr;
+	object_end = object_start + n - 1;
+
+	if (likely((area_start <= object_start) &&
+		   (object_end <= area_end)))
+		return VALID_PMALLOC_OBJECT;
+	else
+		return INVALID_PMALLOC_OBJECT;
+}
+
+
+bool pmalloc_prealloc(struct gen_pool *pool, size_t size)
+{
+	void *chunk;
+	size_t chunk_size;
+	bool add_error;
+
+	/* Expand pool */
+	chunk_size = roundup(size, PAGE_SIZE);
+	chunk = vmalloc(chunk_size);
+	if (unlikely(chunk == NULL))
+		return false;
+
+	/* Locking is already done inside gen_pool_add */
+	add_error = gen_pool_add(pool, (unsigned long)chunk, chunk_size,
+				 NUMA_NO_NODE);
+	if (unlikely(add_error != 0))
+		goto abort;
+
+	return true;
+abort:
+	vfree_atomic(chunk);
+	return false;
+
+}
+
+void *pmalloc(struct gen_pool *pool, size_t size, gfp_t gfp)
+{
+	void *chunk;
+	size_t chunk_size;
+	bool add_error;
+	unsigned long retval;
+
+	if (unlikely(((struct pmalloc_data *)(pool->data))->protected)) {
+		WARN(true, "pool %s is already protected", pool->name);
+		return NULL;
+	}
+
+retry_alloc_from_pool:
+	retval = gen_pool_alloc(pool, size);
+	if (retval)
+		goto return_allocation;
+
+	if (unlikely((gfp & __GFP_ATOMIC))) {
+		if (unlikely((gfp & __GFP_NOFAIL)))
+			goto retry_alloc_from_pool;
+		else
+			return NULL;
+	}
+
+	/* Expand pool */
+	chunk_size = roundup(size, PAGE_SIZE);
+	chunk = vmalloc(chunk_size);
+	if (unlikely(!chunk)) {
+		if (unlikely((gfp & __GFP_NOFAIL)))
+			goto retry_alloc_from_pool;
+		else
+			return NULL;
+	}
+	if (unlikely(!tag_chunk(chunk)))
+		goto free;
+
+	/* Locking is already done inside gen_pool_add */
+	add_error = gen_pool_add(pool, (unsigned long)chunk, chunk_size,
+				 NUMA_NO_NODE);
+	if (unlikely(add_error))
+		goto abort;
+
+	retval = gen_pool_alloc(pool, size);
+	if (retval) {
+return_allocation:
+		*(size_t *)retval = size;
+		if (gfp & __GFP_ZERO)
+			memset((void *)retval, 0, size);
+		return (void *)retval;
+	}
+	/*
+	 * Here there is no test for __GFP_NO_FAIL because, in case of
+	 * concurrent allocation, one thread might add a chunk to the
+	 * pool and this memory could be allocated by another thread,
+	 * before the first thread gets a chance to use it.
+	 * As long as vmalloc succeeds, it's ok to retry.
+	 */
+	goto retry_alloc_from_pool;
+abort:
+	untag_chunk(chunk);
+free:
+	vfree_atomic(chunk);
+	return NULL;
+}
+
+static void pmalloc_chunk_set_protection(struct gen_pool *pool,
+					 struct gen_pool_chunk *chunk,
+					 void *data)
+{
+	const bool *flag = data;
+	size_t chunk_size = chunk->end_addr + 1 - chunk->start_addr;
+	unsigned long pages = chunk_size / PAGE_SIZE;
+
+	if (unlikely(chunk_size & (PAGE_SIZE - 1))) {
+		WARN(true, "Chunk size is not a multiple of PAGE_SIZE.");
+		return;
+	}
+
+	if (*flag)
+		set_memory_ro(chunk->start_addr, pages);
+	else
+		set_memory_rw(chunk->start_addr, pages);
+}
+
+static int pmalloc_pool_set_protection(struct gen_pool *pool, bool protection)
+{
+	struct pmalloc_data *data;
+	struct gen_pool_chunk *chunk;
+
+	data = pool->data;
+	if (unlikely(data->protected == protection)) {
+		WARN(true, "The pool %s is already protected as requested",
+		     pool->name);
+		return 0;
+	}
+	data->protected = protection;
+	list_for_each_entry(chunk, &(pool)->chunks, next_chunk)
+		pmalloc_chunk_set_protection(pool, chunk, &protection);
+	return 0;
+}
+
+int pmalloc_protect_pool(struct gen_pool *pool)
+{
+	return pmalloc_pool_set_protection(pool, true);
+}
+
+
+static void pmalloc_chunk_free(struct gen_pool *pool,
+			       struct gen_pool_chunk *chunk, void *data)
+{
+	untag_chunk(chunk);
+	gen_pool_flush_chunk(pool, chunk);
+	vfree_atomic((void *)chunk->start_addr);
+}
+
+
+int pmalloc_destroy_pool(struct gen_pool *pool)
+{
+	struct pmalloc_data *data;
+
+	data = pool->data;
+
+	mutex_lock(&pmalloc_mutex);
+	list_del(&data->node);
+	mutex_unlock(&pmalloc_mutex);
+
+	if (likely(data->pool_kobject))
+		pmalloc_disconnect(data, data->pool_kobject);
+
+	pmalloc_pool_set_protection(pool, false);
+	gen_pool_for_each_chunk(pool, pmalloc_chunk_free, NULL);
+	gen_pool_destroy(pool);
+	kfree(data);
+	return 0;
+}
+
+/*
+ * When the sysfs is ready to receive registrations, connect all the
+ * pools previously created. Also enable further pools to be connected
+ * right away.
+ */
+static int __init pmalloc_late_init(void)
+{
+	struct pmalloc_data *data, *n;
+
+	pmalloc_kobject = kobject_create_and_add("pmalloc", kernel_kobj);
+
+	mutex_lock(&pmalloc_mutex);
+	pmalloc_list = &pmalloc_final_list;
+
+	if (likely(pmalloc_kobject != NULL)) {
+		list_for_each_entry_safe(data, n, &pmalloc_tmp_list, node) {
+			list_move(&data->node, &pmalloc_final_list);
+			pmalloc_connect(data);
+		}
+	}
+	mutex_unlock(&pmalloc_mutex);
+	return 0;
I'd just go ahead and return a different value if pmalloc_kobject is NULL. The __init function already returns a value and already checks for this failure case, so you might as well go all the way, put a little icing on the cake, and return a distinct (errno) value here.
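For example, something along these lines (just a sketch reshuffling the quoted function; -ENOMEM is my pick for the error value):

static int __init pmalloc_late_init(void)
{
	struct pmalloc_data *data, *n;
	int ret = 0;

	pmalloc_kobject = kobject_create_and_add("pmalloc", kernel_kobj);

	mutex_lock(&pmalloc_mutex);
	pmalloc_list = &pmalloc_final_list;

	if (likely(pmalloc_kobject != NULL)) {
		list_for_each_entry_safe(data, n, &pmalloc_tmp_list, node) {
			list_move(&data->node, &pmalloc_final_list);
			pmalloc_connect(data);
		}
	} else {
		/* Tell the caller the sysfs hookup failed. */
		ret = -ENOMEM;
	}
	mutex_unlock(&pmalloc_mutex);
	return ret;
}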
Thanks,
Jay