The patch titled Subject: execmem: add API for temporal remapping as RW and restoring ROX afterwards has been added to the -mm mm-unstable branch. Its filename is execmem-add-api-for-temporal-remapping-as-rw-and-restoring-rox-afterwards.patch This patch will shortly appear at https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patches/execmem-add-api-for-temporal-remapping-as-rw-and-restoring-rox-afterwards.patch This patch will later appear in the mm-unstable branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm Before you just go and hit "reply", please: a) Consider who else should be cc'ed b) Prefer to cc a suitable mailing list as well c) Ideally: find the original patch on the mailing list and do a reply-to-all to that, adding suitable additional cc's *** Remember to use Documentation/process/submit-checklist.rst when testing your code *** The -mm tree is included into linux-next via the mm-everything branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm and is updated there every 2-3 working days ------------------------------------------------------ From: "Mike Rapoport (Microsoft)" <rppt@xxxxxxxxxx> Subject: execmem: add API for temporal remapping as RW and restoring ROX afterwards Date: Fri, 27 Dec 2024 09:28:21 +0200 Using a writable copy for ROX memory is cumbersome and error prone. Add an API that allows temporarily remapping ranges in the ROX cache as writable and then restoring their read-only-execute permissions. This API will later be used in the modules code and will allow removing the nasty games with a writable copy in alternatives patching on x86. Restoring the ROX permissions relies on the ability of the architecture to reconstruct large pages in its set_memory_rox() method. 
Link: https://lkml.kernel.org/r/20241227072825.1288491-5-rppt@xxxxxxxxxx Signed-off-by: Mike Rapoport (Microsoft) <rppt@xxxxxxxxxx> Cc: Andy Lutomirski <luto@xxxxxxxxxx> Cc: Anton Ivanov <anton.ivanov@xxxxxxxxxxxxxxxxxx> Cc: Borislav Petkov <bp@xxxxxxxxx> Cc: Brendan Higgins <brendan.higgins@xxxxxxxxx> Cc: Daniel Gomez <da.gomez@xxxxxxxxxxx> Cc: Daniel Thompson <danielt@xxxxxxxxxx> Cc: Dave Hansen <dave.hansen@xxxxxxxxxxxxxxx> Cc: David Gow <davidgow@xxxxxxxxxx> Cc: Douglas Anderson <dianders@xxxxxxxxxxxx> Cc: "H. Peter Anvin" <hpa@xxxxxxxxx> Cc: Ingo Molnar <mingo@xxxxxxxxxx> Cc: Jason Wessel <jason.wessel@xxxxxxxxxxxxx> Cc: Jiri Kosina <jikos@xxxxxxxxxx> Cc: Joe Lawrence <joe.lawrence@xxxxxxxxxx> Cc: Johannes Berg <johannes@xxxxxxxxxxxxxxxx> Cc: Josh Poimboeuf <jpoimboe@xxxxxxxxxx> Cc: "Kirill A. Shutemov" <kirill.shutemov@xxxxxxxxxxxxxxx> Cc: Luis Chamberlain <mcgrof@xxxxxxxxxx> Cc: Mark Rutland <mark.rutland@xxxxxxx> Cc: Masami Hiramatsu <mhiramat@xxxxxxxxxx> Cc: Miroslav Benes <mbenes@xxxxxxx> Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx> Cc: Petr Mladek <pmladek@xxxxxxxx> Cc: Petr Pavlu <petr.pavlu@xxxxxxxx> Cc: Rae Moar <rmoar@xxxxxxxxxx> Cc: Richard Weinberger <richard@xxxxxx> Cc: Sami Tolvanen <samitolvanen@xxxxxxxxxx> Cc: Shuah Khan <shuah@xxxxxxxxxx> Cc: Song Liu <song@xxxxxxxxxx> Cc: Steven Rostedt <rostedt@xxxxxxxxxxx> Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx> Signed-off-by: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx> --- include/linux/execmem.h | 31 +++++++++ mm/execmem.c | 118 +++++++++++++++++++++++++++++++------- 2 files changed, 130 insertions(+), 19 deletions(-) --- a/include/linux/execmem.h~execmem-add-api-for-temporal-remapping-as-rw-and-restoring-rox-afterwards +++ a/include/linux/execmem.h @@ -65,6 +65,37 @@ enum execmem_range_flags { * Architectures that use EXECMEM_ROX_CACHE must implement this. 
*/ void execmem_fill_trapping_insns(void *ptr, size_t size, bool writable); + +/** + * execmem_make_temp_rw - temporarily remap region with read-write + * permissions + * @ptr: address of the region to remap + * @size: size of the region to remap + * + * Remaps a part of the cached large page in the ROX cache in the range + * [@ptr, @ptr + @size) as writable and not executable. The caller must + * have exclusive ownership of this range and ensure nothing will try to + * execute code in this range. + * + * Return: 0 on success or negative error code on failure. + */ +int execmem_make_temp_rw(void *ptr, size_t size); + +/** + * execmem_restore_rox - restore read-only-execute permissions + * @ptr: address of the region to remap + * @size: size of the region to remap + * + * Restores read-only-execute permissions on a range [@ptr, @ptr + @size) + * after it was temporarily remapped as writable. Relies on architecture + * implementation of set_memory_rox() to restore mapping using large pages. + * + * Return: 0 on success or negative error code on failure. 
+ */ +int execmem_restore_rox(void *ptr, size_t size); +#else +static inline int execmem_make_temp_rw(void *ptr, size_t size) { return 0; } +static inline int execmem_restore_rox(void *ptr, size_t size) { return 0; } #endif /** --- a/mm/execmem.c~execmem-add-api-for-temporal-remapping-as-rw-and-restoring-rox-afterwards +++ a/mm/execmem.c @@ -89,6 +89,12 @@ static void *execmem_vmalloc(struct exec #endif /* CONFIG_MMU */ #ifdef CONFIG_ARCH_HAS_EXECMEM_ROX +struct execmem_area { + struct vm_struct *vm; + unsigned int rw_mappings; + size_t size; +}; + struct execmem_cache { struct mutex mutex; struct maple_tree busy_areas; @@ -135,7 +141,7 @@ static void execmem_cache_clean(struct w struct maple_tree *free_areas = &execmem_cache.free_areas; struct mutex *mutex = &execmem_cache.mutex; MA_STATE(mas, free_areas, 0, ULONG_MAX); - void *area; + struct execmem_area *area; mutex_lock(mutex); mas_for_each(&mas, area, ULONG_MAX) { @@ -143,11 +149,12 @@ static void execmem_cache_clean(struct w if (IS_ALIGNED(size, PMD_SIZE) && IS_ALIGNED(mas.index, PMD_SIZE)) { - struct vm_struct *vm = find_vm_area(area); + struct vm_struct *vm = area->vm; execmem_set_direct_map_valid(vm, true); mas_store_gfp(&mas, NULL, GFP_KERNEL); - vfree(area); + vfree(vm->addr); + kfree(area); } } mutex_unlock(mutex); @@ -155,30 +162,31 @@ static void execmem_cache_clean(struct w static DECLARE_WORK(execmem_cache_clean_work, execmem_cache_clean); -static int execmem_cache_add(void *ptr, size_t size) +static int execmem_cache_add(void *ptr, size_t size, struct execmem_area *area) { struct maple_tree *free_areas = &execmem_cache.free_areas; struct mutex *mutex = &execmem_cache.mutex; unsigned long addr = (unsigned long)ptr; MA_STATE(mas, free_areas, addr - 1, addr + 1); + struct execmem_area *lower_area = NULL; + struct execmem_area *upper_area = NULL; unsigned long lower, upper; - void *area = NULL; int err; lower = addr; upper = addr + size - 1; mutex_lock(mutex); - area = mas_walk(&mas); - if (area && 
mas.last == addr - 1) + lower_area = mas_walk(&mas); + if (lower_area && lower_area == area && mas.last == addr - 1) lower = mas.index; - area = mas_next(&mas, ULONG_MAX); - if (area && mas.index == addr + size) + upper_area = mas_next(&mas, ULONG_MAX); + if (upper_area && upper_area == area && mas.index == addr + size) upper = mas.last; mas_set_range(&mas, lower, upper); - err = mas_store_gfp(&mas, (void *)lower, GFP_KERNEL); + err = mas_store_gfp(&mas, area, GFP_KERNEL); mutex_unlock(mutex); if (err) return err; @@ -209,7 +217,8 @@ static void *__execmem_cache_alloc(struc MA_STATE(mas_busy, busy_areas, 0, ULONG_MAX); struct mutex *mutex = &execmem_cache.mutex; unsigned long addr, last, area_size = 0; - void *area, *ptr = NULL; + struct execmem_area *area; + void *ptr = NULL; int err; mutex_lock(mutex); @@ -228,20 +237,18 @@ static void *__execmem_cache_alloc(struc /* insert allocated size to busy_areas at range [addr, addr + size) */ mas_set_range(&mas_busy, addr, addr + size - 1); - err = mas_store_gfp(&mas_busy, (void *)addr, GFP_KERNEL); + err = mas_store_gfp(&mas_busy, area, GFP_KERNEL); if (err) goto out_unlock; mas_store_gfp(&mas_free, NULL, GFP_KERNEL); if (area_size > size) { - void *ptr = (void *)(addr + size); - /* * re-insert remaining free size to free_areas at range * [addr + size, last] */ mas_set_range(&mas_free, addr + size, last); - err = mas_store_gfp(&mas_free, ptr, GFP_KERNEL); + err = mas_store_gfp(&mas_free, area, GFP_KERNEL); if (err) { mas_store_gfp(&mas_busy, NULL, GFP_KERNEL); goto out_unlock; @@ -257,16 +264,21 @@ out_unlock: static int execmem_cache_populate(struct execmem_range *range, size_t size) { unsigned long vm_flags = VM_ALLOW_HUGE_VMAP; + struct execmem_area *area; unsigned long start, end; struct vm_struct *vm; size_t alloc_size; int err = -ENOMEM; void *p; + area = kzalloc(sizeof(*area), GFP_KERNEL); + if (!area) + return err; + alloc_size = round_up(size, PMD_SIZE); p = execmem_vmalloc(range, alloc_size, PAGE_KERNEL, 
vm_flags); if (!p) - return err; + goto err_free_area; vm = find_vm_area(p); if (!vm) @@ -289,7 +301,9 @@ static int execmem_cache_populate(struct if (err) goto err_free_mem; - err = execmem_cache_add(p, alloc_size); + area->size = alloc_size; + area->vm = vm; + err = execmem_cache_add(p, alloc_size, area); if (err) goto err_free_mem; @@ -297,6 +311,8 @@ static int execmem_cache_populate(struct err_free_mem: vfree(p); +err_free_area: + kfree(area); return err; } @@ -305,6 +321,9 @@ static void *execmem_cache_alloc(struct void *p; int err; + /* make sure everything in the cache is page aligned */ + size = PAGE_ALIGN(size); + p = __execmem_cache_alloc(range, size); if (p) return p; @@ -322,8 +341,8 @@ static bool execmem_cache_free(void *ptr struct mutex *mutex = &execmem_cache.mutex; unsigned long addr = (unsigned long)ptr; MA_STATE(mas, busy_areas, addr, addr); + struct execmem_area *area; size_t size; - void *area; mutex_lock(mutex); area = mas_walk(&mas); @@ -338,12 +357,73 @@ static bool execmem_cache_free(void *ptr execmem_fill_trapping_insns(ptr, size, /* writable = */ false); - execmem_cache_add(ptr, size); + execmem_cache_add(ptr, size, area); schedule_work(&execmem_cache_clean_work); return true; } + +int execmem_make_temp_rw(void *ptr, size_t size) +{ + struct maple_tree *busy_areas = &execmem_cache.busy_areas; + unsigned int nr = PAGE_ALIGN(size) >> PAGE_SHIFT; + struct mutex *mutex = &execmem_cache.mutex; + unsigned long addr = (unsigned long)ptr; + MA_STATE(mas, busy_areas, addr, addr); + struct execmem_area *area; + int ret = -ENOMEM; + + mutex_lock(mutex); + area = mas_walk(&mas); + if (!area) + goto out; + + ret = set_memory_nx(addr, nr); + if (ret) + goto out; + + /* + * If a split of large page was required, it already happened when + * we marked the pages NX which guarantees that this call won't + * fail + */ + set_memory_rw(addr, nr); + area->rw_mappings++; + +out: + mutex_unlock(mutex); + return ret; +} + +int execmem_restore_rox(void *ptr, 
size_t size) +{ + struct maple_tree *busy_areas = &execmem_cache.busy_areas; + struct mutex *mutex = &execmem_cache.mutex; + unsigned long addr = (unsigned long)ptr; + MA_STATE(mas, busy_areas, addr, addr); + struct execmem_area *area; + int err = 0; + + size = PAGE_ALIGN(size); + + mutex_lock(mutex); + mas_for_each(&mas, area, addr + size - 1) { + area->rw_mappings--; + if (!area->rw_mappings) { + unsigned int nr = area->size >> PAGE_SHIFT; + + addr = (unsigned long)area->vm->addr; + err = set_memory_rox(addr, nr); + if (err) + break; + } + } + mutex_unlock(mutex); + + return err; +} + #else /* CONFIG_ARCH_HAS_EXECMEM_ROX */ static void *execmem_cache_alloc(struct execmem_range *range, size_t size) { _ Patches currently in -mm which might be from rppt@xxxxxxxxxx are x86-mm-pat-cpa-test-fix-length-for-cpa_array-test.patch x86-mm-pat-drop-duplicate-variable-in-cpa_flush.patch execmem-add-api-for-temporal-remapping-as-rw-and-restoring-rox-afterwards.patch module-introduce-module_state_gone.patch modules-switch-to-execmem-api-for-remapping-as-rw-and-restoring-rox.patch revert-x86-module-prepare-module-loading-for-rox-allocations-of-text.patch module-drop-unused-module_writable_address.patch