Re: [RFC PATCH] fs/exec: Add the support for ELF program's NUMA replication

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On 06.09.21 18:16, Huang Shijie wrote:
This patch adds AT_NUMA_REPLICATION for execveat().

If this flag is set, the kernel will trigger COW(copy on write)
on the mmapped ELF binary. So the program will have a copied-page
on its NUMA node, even if the original page in page cache is
on other NUMA nodes.

Am I missing something important or is this just absolutely not what we want?

This means that for each and every invocation of the binary, we'll COW the complete binary -- an awful lot of wasted main memory and swap.

This is not a NUMA replication, this is en executable ELF code replication.


Signed-off-by: Huang Shijie <shijie@xxxxxxxxxxxxxxxxxxxxxx>
---
  fs/binfmt_elf.c            | 27 ++++++++++++++++++++++-----
  fs/exec.c                  |  5 ++++-
  include/linux/binfmts.h    |  1 +
  include/linux/mm.h         |  2 ++
  include/uapi/linux/fcntl.h |  2 ++
  mm/mprotect.c              |  2 +-
  6 files changed, 32 insertions(+), 7 deletions(-)

diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 439ed81e755a..fac8f4a4555a 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -362,13 +362,14 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec,
static unsigned long elf_map(struct file *filep, unsigned long addr,
  		const struct elf_phdr *eppnt, int prot, int type,
-		unsigned long total_size)
+		unsigned long total_size, int numa_replication)
  {
  	unsigned long map_addr;
  	unsigned long size = eppnt->p_filesz + ELF_PAGEOFFSET(eppnt->p_vaddr);
  	unsigned long off = eppnt->p_offset - ELF_PAGEOFFSET(eppnt->p_vaddr);
  	addr = ELF_PAGESTART(addr);
  	size = ELF_PAGEALIGN(size);
+	int ret;
/* mmap() will return -EINVAL if given a zero size, but a
  	 * segment with zero filesize is perfectly valid */
@@ -385,11 +386,26 @@ static unsigned long elf_map(struct file *filep, unsigned long addr,
  	*/
  	if (total_size) {
  		total_size = ELF_PAGEALIGN(total_size);
-		map_addr = vm_mmap(filep, addr, total_size, prot, type, off);
+
+		if (numa_replication) {
+			/* Trigger the COW for this ELF code section */
+			map_addr = vm_mmap(filep, addr, total_size, prot | PROT_WRITE,
+						type | MAP_POPULATE, off);
+			if (!IS_ERR_VALUE(map_addr) && !(prot & PROT_WRITE)) {
+				/* Change back */
+				ret = do_mprotect_pkey(map_addr, total_size, prot, -1);
+				if (ret)
+					return ret;
+			}
+		} else {
+			map_addr = vm_mmap(filep, addr, total_size, prot, type, off);
+		}
+
  		if (!BAD_ADDR(map_addr))
  			vm_munmap(map_addr+size, total_size-size);
-	} else
+	} else {
  		map_addr = vm_mmap(filep, addr, size, prot, type, off);
+	}
if ((type & MAP_FIXED_NOREPLACE) &&
  	    PTR_ERR((void *)map_addr) == -EEXIST)
@@ -635,7 +651,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
  				load_addr = -vaddr;
map_addr = elf_map(interpreter, load_addr + vaddr,
-					eppnt, elf_prot, elf_type, total_size);
+					eppnt, elf_prot, elf_type, total_size, 0);
  			total_size = 0;
  			error = map_addr;
  			if (BAD_ADDR(map_addr))
@@ -1139,7 +1155,8 @@ static int load_elf_binary(struct linux_binprm *bprm)
  		}
error = elf_map(bprm->file, load_bias + vaddr, elf_ppnt,
-				elf_prot, elf_flags, total_size);
+				elf_prot, elf_flags, total_size,
+				bprm->support_numa_replication);
  		if (BAD_ADDR(error)) {
  			retval = IS_ERR((void *)error) ?
  				PTR_ERR((void*)error) : -EINVAL;
diff --git a/fs/exec.c b/fs/exec.c
index 38f63451b928..d27efa540641 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -900,7 +900,7 @@ static struct file *do_open_execat(int fd, struct filename *name, int flags)
  		.lookup_flags = LOOKUP_FOLLOW,
  	};
- if ((flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0)
+	if ((flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH | AT_NUMA_REPLICATION)) != 0)
  		return ERR_PTR(-EINVAL);
  	if (flags & AT_SYMLINK_NOFOLLOW)
  		open_exec_flags.lookup_flags &= ~LOOKUP_FOLLOW;
@@ -1828,6 +1828,9 @@ static int bprm_execve(struct linux_binprm *bprm,
  	if (retval)
  		goto out;
+ /* Do we support NUMA replication for this program? */
+	bprm->support_numa_replication = flags & AT_NUMA_REPLICATION;
+
  	retval = exec_binprm(bprm);
  	if (retval < 0)
  		goto out;
diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
index 049cf9421d83..1874e1732f20 100644
--- a/include/linux/binfmts.h
+++ b/include/linux/binfmts.h
@@ -64,6 +64,7 @@ struct linux_binprm {
  	struct rlimit rlim_stack; /* Saved RLIMIT_STACK used during exec. */
char buf[BINPRM_BUF_SIZE];
+	int support_numa_replication;
  } __randomize_layout;
#define BINPRM_FLAGS_ENFORCE_NONDUMP_BIT 0
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 7ca22e6e694a..76611381be2a 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3244,6 +3244,8 @@ unsigned long wp_shared_mapping_range(struct address_space *mapping,
  #endif
extern int sysctl_nr_trim_pages;
+int do_mprotect_pkey(unsigned long start, size_t len,
+			unsigned long prot, int pkey);
#ifdef CONFIG_PRINTK
  void mem_dump_obj(void *object);
diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h
index 2f86b2ad6d7e..de99c5ae8eca 100644
--- a/include/uapi/linux/fcntl.h
+++ b/include/uapi/linux/fcntl.h
@@ -111,4 +111,6 @@
#define AT_RECURSIVE 0x8000 /* Apply to the entire subtree */ +#define AT_NUMA_REPLICATION 0x10000 /* Support NUMA replication for the ELF program */
+
  #endif /* _UAPI_LINUX_FCNTL_H */
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 883e2cc85cad..d1f8cececfed 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -519,7 +519,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
  /*
   * pkey==-1 when doing a legacy mprotect()
   */
-static int do_mprotect_pkey(unsigned long start, size_t len,
+int do_mprotect_pkey(unsigned long start, size_t len,
  		unsigned long prot, int pkey)
  {
  	unsigned long nstart, end, tmp, reqprot;



--
Thanks,

David / dhildenb




[Index of Archives]     [Linux Ext4 Filesystem]     [Union Filesystem]     [Filesystem Testing]     [Ceph Users]     [Ecryptfs]     [NTFS 3]     [AutoFS]     [Kernel Newbies]     [Share Photos]     [Security]     [Netfilter]     [Bugtraq]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux Cachefs]     [Reiser Filesystem]     [Linux RAID]     [NTFS 3]     [Samba]     [Device Mapper]     [CEPH Development]

  Powered by Linux