The aim of such policy is to prevent a user task from creating an executable mapping that is also writeable. An example of mmap() returning -EACCESS if the policy is enabled: mmap(0, size, PROT_READ | PROT_WRITE | PROT_EXEC, flags, 0, 0); Similarly, mprotect() would return -EACCESS below: addr = mmap(0, size, PROT_READ | PROT_EXEC, flags, 0, 0); mprotect(addr, size, PROT_READ | PROT_WRITE | PROT_EXEC); The BPF filter that systemd MDWE uses is stateless, and disallows mprotect() with PROT_EXEC completely. This new prctl allows PROT_EXEC to be enabled if it was already PROT_EXEC, which allows the following case: addr = mmap(0, size, PROT_READ | PROT_EXEC, flags, 0, 0); mprotect(addr, size, PROT_READ | PROT_EXEC | PROT_BTI); where PROT_BTI enables branch tracking identification on arm64. Signed-off-by: Joey Gouly <joey.gouly@xxxxxxx> Co-developed-by: Catalin Marinas <catalin.marinas@xxxxxxx> Signed-off-by: Catalin Marinas <catalin.marinas@xxxxxxx> Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx> --- include/linux/mman.h | 34 ++++++++++++++++++++++++++++++++++ include/linux/sched/coredump.h | 6 +++++- include/uapi/linux/prctl.h | 6 ++++++ kernel/sys.c | 33 +++++++++++++++++++++++++++++++++ mm/mmap.c | 10 ++++++++++ mm/mprotect.c | 5 +++++ 6 files changed, 93 insertions(+), 1 deletion(-) diff --git a/include/linux/mman.h b/include/linux/mman.h index 58b3abd457a3..cee1e4b566d8 100644 --- a/include/linux/mman.h +++ b/include/linux/mman.h @@ -156,4 +156,38 @@ calc_vm_flag_bits(unsigned long flags) } unsigned long vm_commit_limit(void); + +/* + * Denies creating a writable executable mapping or gaining executable permissions. + * + * This denies the following: + * + * a) mmap(PROT_WRITE | PROT_EXEC) + * + * b) mmap(PROT_WRITE) + * mprotect(PROT_EXEC) + * + * c) mmap(PROT_WRITE) + * mprotect(PROT_READ) + * mprotect(PROT_EXEC) + * + * But allows the following: + * + * d) mmap(PROT_READ | PROT_EXEC) + * mmap(PROT_READ | PROT_EXEC | PROT_BTI) + */ +static inline bool map_deny_write_exec(struct vm_area_struct *vma, unsigned long vm_flags) +{ + if (!test_bit(MMF_HAS_MDWE, ¤t->mm->flags)) + return false; + + if ((vm_flags & VM_EXEC) && (vm_flags & VM_WRITE)) + return true; + + if (!(vma->vm_flags & VM_EXEC) && (vm_flags & VM_EXEC)) + return true; + + return false; +} + #endif /* _LINUX_MMAN_H */ diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h index 8270ad7ae14c..0e17ae7fbfd3 100644 --- a/include/linux/sched/coredump.h +++ b/include/linux/sched/coredump.h @@ -81,9 +81,13 @@ static inline int get_dumpable(struct mm_struct *mm) * lifecycle of this mm, just for simplicity. */ #define MMF_HAS_PINNED 27 /* FOLL_PIN has run, never cleared */ + +#define MMF_HAS_MDWE 28 +#define MMF_HAS_MDWE_MASK (1 << MMF_HAS_MDWE) + #define MMF_DISABLE_THP_MASK (1 << MMF_DISABLE_THP) #define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\ - MMF_DISABLE_THP_MASK) + MMF_DISABLE_THP_MASK | MMF_HAS_MDWE_MASK) #endif /* _LINUX_SCHED_COREDUMP_H */ diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index a5e06dcbba13..1312a137f7fb 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -281,6 +281,12 @@ struct prctl_mm_map { # define PR_SME_VL_LEN_MASK 0xffff # define PR_SME_VL_INHERIT (1 << 17) /* inherit across exec */ +/* Memory deny write / execute */ +#define PR_SET_MDWE 65 +# define PR_MDWE_REFUSE_EXEC_GAIN 1 + +#define PR_GET_MDWE 66 + #define PR_SET_VMA 0x53564d41 # define PR_SET_VMA_ANON_NAME 0 diff --git a/kernel/sys.c b/kernel/sys.c index 5fd54bf0e886..b3cab94545ed 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2348,6 +2348,33 @@ static int prctl_set_vma(unsigned long opt, unsigned long start, } #endif /* CONFIG_ANON_VMA_NAME */ +static inline int prctl_set_mdwe(unsigned long bits, unsigned long arg3, + unsigned long arg4, unsigned long arg5) +{ + if (arg3 || arg4 || arg5) + return -EINVAL; + + if (bits & ~(PR_MDWE_REFUSE_EXEC_GAIN)) + return -EINVAL; + + if (bits & PR_MDWE_REFUSE_EXEC_GAIN) + set_bit(MMF_HAS_MDWE, ¤t->mm->flags); + else if (test_bit(MMF_HAS_MDWE, ¤t->mm->flags)) + return -EPERM; /* Cannot unset the flag */ + + return 0; +} + +static inline int prctl_get_mdwe(unsigned long arg2, unsigned long arg3, + unsigned long arg4, unsigned long arg5) +{ + if (arg2 || arg3 || arg4 || arg5) + return -EINVAL; + + return test_bit(MMF_HAS_MDWE, ¤t->mm->flags) ? + PR_MDWE_REFUSE_EXEC_GAIN : 0; +} + SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, unsigned long, arg4, unsigned long, arg5) { @@ -2623,6 +2650,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, error = sched_core_share_pid(arg2, arg3, arg4, arg5); break; #endif + case PR_SET_MDWE: + error = prctl_set_mdwe(arg2, arg3, arg4, arg5); + break; + case PR_GET_MDWE: + error = prctl_get_mdwe(arg2, arg3, arg4, arg5); + break; case PR_SET_VMA: error = prctl_set_vma(arg2, arg3, arg4, arg5); break; diff --git a/mm/mmap.c b/mm/mmap.c index 87d929316d57..99a4d9e2b0d8 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2665,6 +2665,16 @@ unsigned long mmap_region(struct file *file, unsigned long addr, vma_set_anonymous(vma); } + if (map_deny_write_exec(vma, vma->vm_flags)) { + error = -EACCES; + if (file) + goto close_and_free_vma; + else if (vma->vm_file) + goto unmap_and_free_vma; + else + goto free_vma; + } + /* Allow architectures to sanity-check the vm_flags */ if (!arch_validate_flags(vma->vm_flags)) { error = -EINVAL; diff --git a/mm/mprotect.c b/mm/mprotect.c index 908df12caa26..bc0587df042f 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -762,6 +762,11 @@ static int do_mprotect_pkey(unsigned long start, size_t len, break; } + if (map_deny_write_exec(vma, newflags)) { + error = -EACCES; + goto out; + } + /* Allow architectures to sanity-check the new flags */ if (!arch_validate_flags(newflags)) { error = -EINVAL; -- 2.17.1