This patch adds AT_NUMA_REPLICATION for execveat(). If this flag is set, the kernel will trigger COW(copy on write) on the mmapped ELF binary. So the program will have a copied-page on its NUMA node, even if the original page in page cache is on other NUMA nodes. Signed-off-by: Huang Shijie <shijie@xxxxxxxxxxxxxxxxxxxxxx> --- fs/binfmt_elf.c | 27 ++++++++++++++++++++++----- fs/exec.c | 5 ++++- include/linux/binfmts.h | 1 + include/linux/mm.h | 2 ++ include/uapi/linux/fcntl.h | 2 ++ mm/mprotect.c | 2 +- 6 files changed, 32 insertions(+), 7 deletions(-) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 439ed81e755a..fac8f4a4555a 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -362,13 +362,14 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec, static unsigned long elf_map(struct file *filep, unsigned long addr, const struct elf_phdr *eppnt, int prot, int type, - unsigned long total_size) + unsigned long total_size, int numa_replication) { unsigned long map_addr; unsigned long size = eppnt->p_filesz + ELF_PAGEOFFSET(eppnt->p_vaddr); unsigned long off = eppnt->p_offset - ELF_PAGEOFFSET(eppnt->p_vaddr); addr = ELF_PAGESTART(addr); size = ELF_PAGEALIGN(size); + int ret; /* mmap() will return -EINVAL if given a zero size, but a * segment with zero filesize is perfectly valid */ @@ -385,11 +386,26 @@ static unsigned long elf_map(struct file *filep, unsigned long addr, */ if (total_size) { total_size = ELF_PAGEALIGN(total_size); - map_addr = vm_mmap(filep, addr, total_size, prot, type, off); + + if (numa_replication) { + /* Trigger the COW for this ELF code section */ + map_addr = vm_mmap(filep, addr, total_size, prot | PROT_WRITE, + type | MAP_POPULATE, off); + if (!IS_ERR_VALUE(map_addr) && !(prot & PROT_WRITE)) { + /* Change back */ + ret = do_mprotect_pkey(map_addr, total_size, prot, -1); + if (ret) + return ret; + } + } else { + map_addr = vm_mmap(filep, addr, total_size, prot, type, off); + } + if (!BAD_ADDR(map_addr)) vm_munmap(map_addr+size, total_size-size); - } else + } else { map_addr = vm_mmap(filep, addr, size, prot, type, off); + } if ((type & MAP_FIXED_NOREPLACE) && PTR_ERR((void *)map_addr) == -EEXIST) @@ -635,7 +651,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, load_addr = -vaddr; map_addr = elf_map(interpreter, load_addr + vaddr, - eppnt, elf_prot, elf_type, total_size); + eppnt, elf_prot, elf_type, total_size, 0); total_size = 0; error = map_addr; if (BAD_ADDR(map_addr)) @@ -1139,7 +1155,8 @@ static int load_elf_binary(struct linux_binprm *bprm) } error = elf_map(bprm->file, load_bias + vaddr, elf_ppnt, - elf_prot, elf_flags, total_size); + elf_prot, elf_flags, total_size, + bprm->support_numa_replication); if (BAD_ADDR(error)) { retval = IS_ERR((void *)error) ? PTR_ERR((void*)error) : -EINVAL; diff --git a/fs/exec.c b/fs/exec.c index 38f63451b928..d27efa540641 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -900,7 +900,7 @@ static struct file *do_open_execat(int fd, struct filename *name, int flags) .lookup_flags = LOOKUP_FOLLOW, }; - if ((flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0) + if ((flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH | AT_NUMA_REPLICATION)) != 0) return ERR_PTR(-EINVAL); if (flags & AT_SYMLINK_NOFOLLOW) open_exec_flags.lookup_flags &= ~LOOKUP_FOLLOW; @@ -1828,6 +1828,9 @@ static int bprm_execve(struct linux_binprm *bprm, if (retval) goto out; + /* Do we support NUMA replication for this program? */ + bprm->support_numa_replication = flags & AT_NUMA_REPLICATION; + retval = exec_binprm(bprm); if (retval < 0) goto out; diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index 049cf9421d83..1874e1732f20 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -64,6 +64,7 @@ struct linux_binprm { struct rlimit rlim_stack; /* Saved RLIMIT_STACK used during exec. */ char buf[BINPRM_BUF_SIZE]; + int support_numa_replication; } __randomize_layout; #define BINPRM_FLAGS_ENFORCE_NONDUMP_BIT 0 diff --git a/include/linux/mm.h b/include/linux/mm.h index 7ca22e6e694a..76611381be2a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3244,6 +3244,8 @@ unsigned long wp_shared_mapping_range(struct address_space *mapping, #endif extern int sysctl_nr_trim_pages; +int do_mprotect_pkey(unsigned long start, size_t len, + unsigned long prot, int pkey); #ifdef CONFIG_PRINTK void mem_dump_obj(void *object); diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h index 2f86b2ad6d7e..de99c5ae8eca 100644 --- a/include/uapi/linux/fcntl.h +++ b/include/uapi/linux/fcntl.h @@ -111,4 +111,6 @@ #define AT_RECURSIVE 0x8000 /* Apply to the entire subtree */ +#define AT_NUMA_REPLICATION 0x10000 /* Support NUMA replication for the ELF program */ + #endif /* _UAPI_LINUX_FCNTL_H */ diff --git a/mm/mprotect.c b/mm/mprotect.c index 883e2cc85cad..d1f8cececfed 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -519,7 +519,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, /* * pkey==-1 when doing a legacy mprotect() */ -static int do_mprotect_pkey(unsigned long start, size_t len, +int do_mprotect_pkey(unsigned long start, size_t len, unsigned long prot, int pkey) { unsigned long nstart, end, tmp, reqprot; -- 2.30.2