[PATCH 7/7] binfmt: Introduce the binfmt_img exec handler

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



When execve-ed, the handler reads the registers, mappings and provided
memory pages from the image and simply assigns this state to the current
task. This simple functionality can be used to restore a task whose state
was read from e.g. the /proc/<pid>/dump file before.

As I said before, the mentioned proc file format is designed to be as
simple as possible. It can (and should) be redesigned (ELF?).

Signed-off-by: Pavel Emelyanov <xemul@xxxxxxxxxxxxx>

---
 fs/Kconfig.binfmt |    6 +
 fs/Makefile       |    1 +
 fs/binfmt_img.c   |  324 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 331 insertions(+), 0 deletions(-)
 create mode 100644 fs/binfmt_img.c

diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index 79e2ca7..0b2f48e 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -161,3 +161,9 @@ config BINFMT_MISC
 	  You may say M here for module support and later load the module when
 	  you have use for it; the module is called binfmt_misc. If you
 	  don't know what to answer at this point, say Y.
+
+config BINFMT_IMG
+	tristate "Kernel support for IMG binaries"
+	depends on X86_64
+	help
+	  Say M/Y here to enable support for checkpoint-restore images execution
diff --git a/fs/Makefile b/fs/Makefile
index fb68c2b..8221719 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -33,6 +33,7 @@ obj-$(CONFIG_NFSD_DEPRECATED)	+= nfsctl.o
 obj-$(CONFIG_BINFMT_AOUT)	+= binfmt_aout.o
 obj-$(CONFIG_BINFMT_EM86)	+= binfmt_em86.o
 obj-$(CONFIG_BINFMT_MISC)	+= binfmt_misc.o
+obj-$(CONFIG_BINFMT_IMG)	+= binfmt_img.o
 
 # binfmt_script is always there
 obj-y				+= binfmt_script.o
diff --git a/fs/binfmt_img.c b/fs/binfmt_img.c
new file mode 100644
index 0000000..9b09797
--- /dev/null
+++ b/fs/binfmt_img.c
@@ -0,0 +1,324 @@
+#include <linux/binfmt_img.h>
+#include <linux/module.h>
+#include <linux/binfmts.h>
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/highmem.h>
+#include <asm/tlbflush.h>
+#include <asm/desc.h>
+
+/*
+ * The binary handler to save and restore a single task state
+ */
+
+/*
+ * Validate the image header found in @buf.
+ * Returns the header size, or a negative errno.
+ */
+static int img_check_header(void *buf)
+{
+	const struct binfmt_img_header *img = buf;
+
+	if (img->magic != BINFMT_IMG_MAGIC)
+		return -ENOEXEC;
+	return img->version == BINFMT_IMG_VERS_0 ? (int)sizeof(*img) : -EINVAL;
+}
+
+/*
+ * Map a selector code from the (untrusted) image to a real selector. Codes
+ * that cannot be decoded give the null selector rather than a BUG().
+ */
+static unsigned short decode_segment(__u16 seg)
+{
+	if (seg == CKPT_X86_SEG_NULL)
+		return 0;
+	if (seg == CKPT_X86_SEG_USER64_CS)
+		return __USER_CS;
+	if (seg == CKPT_X86_SEG_USER64_DS)
+		return __USER_DS;
+#ifdef CONFIG_COMPAT
+	if (seg == CKPT_X86_SEG_USER32_CS)
+		return __USER32_CS;
+	if (seg == CKPT_X86_SEG_USER32_DS)
+		return __USER32_DS;
+#endif
+	if (seg & CKPT_X86_SEG_TLS) {
+		seg &= ~CKPT_X86_SEG_TLS;	/* what is left is the TLS slot */
+		return seg < GDT_ENTRY_TLS_ENTRIES ?
+			((GDT_ENTRY_TLS_MIN + seg) << 3) | 3 : 0;
+	}
+	return (seg & CKPT_X86_SEG_LDT) ?
+		((seg & ~CKPT_X86_SEG_LDT) << 3) | 7 : 0;
+}
+
+static void decode_tls(struct desc_struct *d, __u64 val)
+{
+	d->b = lower_32_bits(val);	/* bits 31..0 */
+	d->a = upper_32_bits(val);	/* bits 63..32 */
+}
+
+/*
+ * Read the register image at @off and install it on current (pt_regs, segment
+ * selectors, TLS, fs/gs bases). Returns bytes consumed or a negative errno.
+ */
+static int img_restore_regs(struct linux_binprm *bprm, loff_t off, struct pt_regs *regs)
+{
+	int ret, i;
+	struct binfmt_regs_image regi;
+	struct thread_struct *th = &current->thread;
+	unsigned short seg;
+
+	ret = kernel_read(bprm->file, off, (char *)&regi, sizeof(regi));
+	if (ret != sizeof(regi))
+		return -EIO;
+	if (regi.fs >= TASK_SIZE || regi.gs >= TASK_SIZE)
+		return -EPERM;	/* only user addresses, like arch_prctl() */
+
+	regs->r15 = regi.r15;
+	regs->r14 = regi.r14;
+	regs->r13 = regi.r13;
+	regs->r12 = regi.r12;
+	regs->r11 = regi.r11;
+	regs->r10 = regi.r10;
+	regs->r9 = regi.r9;
+	regs->r8 = regi.r8;
+	regs->ax = regi.ax;
+	regs->orig_ax = regi.orig_ax;
+	regs->bx = regi.bx;
+	regs->cx = regi.cx;
+	regs->dx = regi.dx;
+	regs->si = regi.si;
+	regs->di = regi.di;
+	regs->ip = regi.ip;
+	regs->flags = regi.flags;	/* NOTE(review): unmasked (IOPL!) */
+	regs->bp = regi.bp;
+	regs->sp = regi.sp;
+	regs->cs = decode_segment(regi.cs);
+	regs->ss = decode_segment(regi.ss);
+	th->usersp = regi.sp;
+	th->ds = decode_segment(regi.ds);
+	th->es = decode_segment(regi.es);
+	th->fsindex = decode_segment(regi.fsindex);
+	th->gsindex = decode_segment(regi.gsindex);
+	th->fs = regi.fs;
+	th->gs = regi.gs;
+
+	BUILD_BUG_ON(GDT_ENTRY_TLS_ENTRIES != CKPT_TLS_ENTRIES);
+	for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
+		decode_tls(&th->tls_array[i], regi.tls[i]);
+	load_TLS(th, get_cpu());	/* pin CPU across the GDT update */
+	put_cpu();
+	seg = th->fsindex;
+	loadsegment(fs, seg);
+	savesegment(fs, seg);
+	if (seg != th->fsindex) {
+		printk(KERN_ERR "ERROR saving fs selector want %x, has %x\n",
+				(unsigned int)th->fsindex, (unsigned int)seg);
+		return -EFAULT;
+	}
+	if (th->fs)
+		wrmsrl(MSR_FS_BASE, th->fs);
+	load_gs_index(th->gsindex);
+	if (th->gs)
+		wrmsrl(MSR_KERNEL_GS_BASE, th->gs);
+	return sizeof(regi);
+}
+
+/* Load the mm image at @off into current->mm; returns its size or -errno. */
+static int img_restore_mm(struct linux_binprm *bprm, loff_t off)
+{
+	int ret;
+	struct binfmt_mm_image mmi;
+	struct mm_struct *mm = current->mm;
+	struct file *f;
+
+	ret = kernel_read(bprm->file, off, (char *)&mmi, sizeof(mmi));
+	if (ret != sizeof(mmi))
+		return -EIO;
+
+	mm->flags = mmi.flags;	/* NOTE(review): raw copy of untrusted bits */
+	mm->def_flags = mmi.def_flags;
+	mm->start_code = mmi.start_code;
+	mm->end_code = mmi.end_code;
+	mm->start_data = mmi.start_data;
+	mm->end_data = mmi.end_data;
+	mm->start_brk = mmi.start_brk;
+	mm->brk = mmi.brk;
+	mm->start_stack = mmi.start_stack;
+	mm->arg_start = mmi.arg_start;
+	mm->arg_end = mmi.arg_end;
+	mm->env_start = mmi.env_start;
+	mm->env_end = mmi.env_end;
+
+	if (mmi.exe_fd != 0) {
+		f = fget(mmi.exe_fd);
+		if (f == NULL)
+			return -EBADF;
+		if (mm->exe_file)	/* may be unset, fput(NULL) would oops */
+			fput(mm->exe_file);
+		mm->exe_file = f;
+	}
+
+	return sizeof(mmi);
+}
+
+/*
+ * Unmap everything, then map the VMAs listed at @off (terminated by an entry
+ * with zero start and end). Returns the bytes consumed or a negative errno.
+ */
+static int img_restore_vmas(struct linux_binprm *bprm, loff_t off)
+{
+	int ret;
+	struct mm_struct *mm = current->mm;
+	int len = 0;
+
+	down_write(&mm->mmap_sem);	/* address space changes need it */
+	do_munmap(mm, 0, TASK_SIZE);
+	up_write(&mm->mmap_sem);
+
+	while (1) {
+		struct binfmt_vma_image vmai;
+		unsigned long addr;
+		struct file *file = NULL;
+
+		len += sizeof(vmai);
+		ret = kernel_read(bprm->file, off, (char *)&vmai, sizeof(vmai));
+		if (ret != sizeof(vmai))
+			return -EIO;
+		if (vmai.start == 0 && vmai.end == 0)
+			break;
+		if (vmai.fd != 0) {
+			file = fget(vmai.fd);
+			if (file == NULL)
+				return -EBADF;
+		} else
+			vmai.flags |= MAP_ANONYMOUS;
+		if (vmai.start <= mm->start_stack && vmai.end >= mm->start_stack)
+			vmai.flags |= MAP_GROWSDOWN;
+
+		down_write(&mm->mmap_sem);
+		addr = do_mmap_pgoff(file, vmai.start, vmai.end - vmai.start,
+				vmai.prot, vmai.flags | MAP_FIXED, vmai.pgoff);
+		up_write(&mm->mmap_sem);
+		if (vmai.fd) {
+			fput(file);
+			do_close(vmai.fd);
+		}
+		if ((long)addr < 0 || (addr != vmai.start))
+			return -ENXIO;
+		off += sizeof(vmai);
+	}
+	return len;
+}
+
+/*
+ * Copy the page records at @off (header + PAGE_SIZE of data, ended by a zero
+ * vaddr) into current's memory. Returns bytes consumed or a negative errno.
+ */
+static int img_restore_pages(struct linux_binprm *bprm, loff_t off)
+{
+	int ret;
+	struct mm_struct *mm = current->mm;
+	int len = 0;
+
+	while (1) {
+		struct binfmt_page_image pgi;
+		struct vm_area_struct *vma;
+		struct page *page;
+		void *pg_data;
+
+		ret = kernel_read(bprm->file, off, (char *)&pgi, sizeof(pgi));
+		if (ret != sizeof(pgi))
+			return -EIO;
+		len += sizeof(pgi);
+		if (pgi.vaddr == 0)
+			break;
+
+		down_read(&mm->mmap_sem);	/* for find_vma() and gup */
+		vma = find_vma(mm, pgi.vaddr);
+		ret = vma ? get_user_pages(current, mm, (unsigned long)pgi.vaddr,
+				1, 1, 1, &page, NULL) : -ESRCH;
+		up_read(&mm->mmap_sem);
+		if (ret != 1)
+			return vma ? -EFAULT : -ESRCH;
+
+		pg_data = kmap(page);
+		ret = kernel_read(bprm->file, off + sizeof(pgi), pg_data, PAGE_SIZE);
+		kunmap(page);
+		put_page(page);
+		if (ret != PAGE_SIZE)
+			return -EFAULT;
+		len += PAGE_SIZE;
+		off += sizeof(pgi) + PAGE_SIZE;
+	}
+
+	return len;
+}
+
+/*
+ * Restore the memory state stored at @off: the mm fields, then the VMA list,
+ * then the page contents. Returns the number of image bytes consumed, like
+ * the other img_restore_*() helpers, or a negative errno.
+ */
+static int img_restore_mem(struct linux_binprm *bprm, loff_t off)
+{
+	loff_t pos = off;
+	int ret;
+
+	ret = img_restore_mm(bprm, pos);
+	if (ret < 0)
+		return ret;
+	pos += ret;
+
+	ret = img_restore_vmas(bprm, pos);
+	if (ret < 0)
+		return ret;
+	pos += ret;
+
+	ret = img_restore_pages(bprm, pos);
+	return ret < 0 ? ret : pos + ret - off;
+}
+
+/*
+ * ->load_binary: check the header in bprm->buf, then restore regs and memory.
+ */
+static int img_load_binary(struct linux_binprm * bprm, struct pt_regs * regs)
+{
+	loff_t pos;
+	int err;
+
+	err = img_check_header(bprm->buf);
+	if (err < 0)
+		return err;
+	pos = err;
+
+	err = img_restore_regs(bprm, pos, regs);
+	if (err < 0)
+		return err;
+	pos += err;
+
+	err = img_restore_mem(bprm, pos);
+	return err < 0 ? err : 0;
+}
+
+/* Handler descriptor: only ->load_binary is provided */
+static struct linux_binfmt img_binfmt = {
+	.module		= THIS_MODULE,
+	.load_binary	= img_load_binary,
+};
+
+static int __init img_binfmt_init(void)
+{
+	return register_binfmt(&img_binfmt);
+}
+module_init(img_binfmt_init);
+
+static void __exit img_binfmt_exit(void)
+{
+	unregister_binfmt(&img_binfmt);
+}
+module_exit(img_binfmt_exit);
+MODULE_LICENSE("GPL");
-- 
1.5.5.6
_______________________________________________
Containers mailing list
Containers@xxxxxxxxxxxxxxxxxxxxxxxxxx
https://lists.linux-foundation.org/mailman/listinfo/containers


[Index of Archives]     [Cgroups]     [Netdev]     [Linux Wireless]     [Kernel Newbies]     [Security]     [Linux for Hams]     [Netfilter]     [Bugtraq]     [Yosemite Forum]     [MIPS Linux]     [ARM Linux]     [Linux RAID]     [Linux Admin]     [Samba]

  Powered by Linux