When execve-ed, the handler reads the registers, mappings and provided memory pages from the image and simply applies this state to the current task. This simple functionality can be used to restore a task whose state was previously read from e.g. the /proc/<pid>/dump file. As I said before, the mentioned proc file format is designed to be as simple as possible. It can (and should) be redesigned (ELF?). Signed-off-by: Pavel Emelyanov <xemul@xxxxxxxxxxxxx> --- fs/Kconfig.binfmt | 6 + fs/Makefile | 1 + fs/binfmt_img.c | 324 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 331 insertions(+), 0 deletions(-) create mode 100644 fs/binfmt_img.c diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt index 79e2ca7..0b2f48e 100644 --- a/fs/Kconfig.binfmt +++ b/fs/Kconfig.binfmt @@ -161,3 +161,9 @@ config BINFMT_MISC You may say M here for module support and later load the module when you have use for it; the module is called binfmt_misc. If you don't know what to answer at this point, say Y. + +config BINFMT_IMG + tristate "Kernel support for IMG binaries" + depends on X86 + help + Say M/Y here to enable support for checkpoint-restore images execution diff --git a/fs/Makefile b/fs/Makefile index fb68c2b..8221719 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -33,6 +33,7 @@ obj-$(CONFIG_NFSD_DEPRECATED) += nfsctl.o obj-$(CONFIG_BINFMT_AOUT) += binfmt_aout.o obj-$(CONFIG_BINFMT_EM86) += binfmt_em86.o obj-$(CONFIG_BINFMT_MISC) += binfmt_misc.o +obj-$(CONFIG_BINFMT_IMG) += binfmt_img.o # binfmt_script is always there obj-y += binfmt_script.o diff --git a/fs/binfmt_img.c b/fs/binfmt_img.c new file mode 100644 index 0000000..9b09797 --- /dev/null +++ b/fs/binfmt_img.c @@ -0,0 +1,324 @@ +#include <linux/binfmt_img.h> +#include <linux/module.h> +#include <linux/binfmts.h> +#include <linux/sched.h> +#include <linux/fs.h> +#include <linux/file.h> +#include <linux/mm.h> +#include <linux/mman.h> +#include <linux/highmem.h> +#include <asm/tlbflush.h> +#include <asm/desc.h> + +/* + * The 
binary handler to save and restore a single task state + */ + +static int img_check_header(void *buf) +{ + struct binfmt_img_header *hdr = buf; + + if (hdr->magic != BINFMT_IMG_MAGIC) + return -ENOEXEC; + + if (hdr->version != BINFMT_IMG_VERS_0) + return -EINVAL; + + return sizeof(*hdr); +} + +static unsigned short decode_segment(__u16 seg) +{ + if (seg == CKPT_X86_SEG_NULL) + return 0; + + if (seg == CKPT_X86_SEG_USER64_CS) + return __USER_CS; + if (seg == CKPT_X86_SEG_USER64_DS) + return __USER_DS; +#ifdef CONFIG_COMPAT + if (seg == CKPT_X86_SEG_USER32_CS) + return __USER32_CS; + if (seg == CKPT_X86_SEG_USER32_DS) + return __USER32_DS; +#endif + + if (seg & CKPT_X86_SEG_TLS) { + seg &= ~CKPT_X86_SEG_TLS; + return ((GDT_ENTRY_TLS_MIN + seg) << 3) | 3; + } + if (seg & CKPT_X86_SEG_LDT) { + seg &= ~CKPT_X86_SEG_LDT; + return (seg << 3) | 7; + } + BUG(); +} + +static void decode_tls(struct desc_struct *d, __u64 val) +{ + d->a = (unsigned int)(val >> 32); + d->b = (unsigned int)(val & 0xFFFFFFFF); +} + +static int img_restore_regs(struct linux_binprm *bprm, loff_t off, struct pt_regs *regs) +{ + int ret, i; + struct binfmt_regs_image regi; + struct thread_struct *th = ¤t->thread; + unsigned short seg; + + ret = kernel_read(bprm->file, off, (char *)®i, sizeof(regi)); + if (ret != sizeof(regi)) + return -EIO; + + regs->r15 = regi.r15; + regs->r14 = regi.r14; + regs->r13 = regi.r13; + regs->r12 = regi.r12; + regs->r11 = regi.r11; + regs->r10 = regi.r10; + regs->r9 = regi.r9; + regs->r8 = regi.r8; + regs->ax = regi.ax; + regs->orig_ax = regi.orig_ax; + regs->bx = regi.bx; + regs->cx = regi.cx; + regs->dx = regi.dx; + regs->si = regi.si; + regs->di = regi.di; + regs->ip = regi.ip; + regs->flags = regi.flags; + regs->bp = regi.bp; + regs->sp = regi.sp; + + regs->cs = decode_segment(regi.cs); + regs->ss = decode_segment(regi.ss); + + th->usersp = regi.sp; + th->ds = decode_segment(regi.ds); + th->es = decode_segment(regi.es); + th->fsindex = decode_segment(regi.fsindex); + 
th->gsindex = decode_segment(regi.gsindex); + + th->fs = regi.fs; + th->gs = regi.gs; + + BUILD_BUG_ON(GDT_ENTRY_TLS_ENTRIES != CKPT_TLS_ENTRIES); + for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++) + decode_tls(&th->tls_array[i], regi.tls[i]); + + load_TLS(th, smp_processor_id()); + + seg = th->fsindex; + loadsegment(fs, seg); + savesegment(fs, seg); + if (seg != th->fsindex) { + printk("ERROR saving fs selector want %x, has %x\n", + (unsigned int)th->fsindex, (unsigned int)seg); + return -EFAULT; + } + + if (th->fs) + wrmsrl(MSR_FS_BASE, th->fs); + load_gs_index(th->gsindex); + if (th->gs) + wrmsrl(MSR_KERNEL_GS_BASE, th->gs); + + return sizeof(regi); +} + +static int img_restore_mm(struct linux_binprm *bprm, loff_t off) +{ + int ret; + struct binfmt_mm_image mmi; + struct mm_struct *mm = current->mm; + + ret = kernel_read(bprm->file, off, (char *)&mmi, sizeof(mmi)); + if (ret != sizeof(mmi)) + return -EIO; + + mm->flags = mmi.flags; + mm->def_flags = mmi.def_flags; + mm->start_code = mmi.start_code; + mm->end_code = mmi.end_code; + mm->start_data = mmi.start_data; + mm->end_data = mmi.end_data; + mm->start_brk = mmi.start_brk; + mm->brk = mmi.brk; + mm->start_stack = mmi.start_stack; + mm->arg_start = mmi.arg_start; + mm->arg_end = mmi.arg_end; + mm->env_start = mmi.env_start; + mm->env_end = mmi.env_end; + + if (mmi.exe_fd != 0) { + struct file *f; + + f = fget(mmi.exe_fd); + if (f == NULL) + return -EBADF; + + fput(mm->exe_file); + mm->exe_file = f; + } + + return sizeof(mmi); +} + +static int img_restore_vmas(struct linux_binprm *bprm, loff_t off) +{ + int ret; + struct mm_struct *mm = current->mm; + int len = 0; + + do_munmap(mm, 0, TASK_SIZE); + + while (1) { + struct binfmt_vma_image vmai; + unsigned long addr; + struct file *file = NULL; + + len += sizeof(vmai); + + ret = kernel_read(bprm->file, off, (char *)&vmai, sizeof(vmai)); + if (ret != sizeof(vmai)) + return -EIO; + + if (vmai.start == 0 && vmai.end == 0) + break; + + if (vmai.fd != 0) { + file = 
fget(vmai.fd); + if (file == NULL) + return -EBADF; + } else + vmai.flags |= MAP_ANONYMOUS; + + if (vmai.start <= mm->start_stack && vmai.end >= mm->start_stack) + vmai.flags |= MAP_GROWSDOWN; + + addr = do_mmap_pgoff(file, vmai.start, vmai.end - vmai.start, + vmai.prot, vmai.flags | MAP_FIXED, vmai.pgoff); + + if (vmai.fd) { + fput(file); + do_close(vmai.fd); + } + + if ((long)addr < 0 || (addr != vmai.start)) + return -ENXIO; + + off += sizeof(vmai); + } + + return len; +} + +static int img_restore_pages(struct linux_binprm *bprm, loff_t off) +{ + int ret; + struct mm_struct *mm = current->mm; + int len = 0; + + while (1) { + struct binfmt_page_image pgi; + struct vm_area_struct *vma; + struct page *page; + void *pg_data; + + ret = kernel_read(bprm->file, off, (char *)&pgi, sizeof(pgi)); + if (ret != sizeof(pgi)) + return -EIO; + + len += sizeof(pgi); + if (pgi.vaddr == 0) + break; + + vma = find_vma(mm, pgi.vaddr); + if (vma == NULL) + return -ESRCH; + + ret = get_user_pages(current, current->mm, (unsigned long)pgi.vaddr, + 1, 1, 1, &page, NULL); + if (ret != 1) + return -EFAULT; + + pg_data = kmap(page); + ret = kernel_read(bprm->file, off + sizeof(pgi), pg_data, PAGE_SIZE); + kunmap(page); + put_page(page); + + if (ret != PAGE_SIZE) + return -EFAULT; + + len += PAGE_SIZE; + off += sizeof(pgi) + PAGE_SIZE; + } + + return len; +} + +static int img_restore_mem(struct linux_binprm *bprm, loff_t off) +{ + int ret; + loff_t len = off; + + ret = img_restore_mm(bprm, len); + if (ret < 0) + return ret; + + len += ret; + ret = img_restore_vmas(bprm, len); + if (ret < 0) + return ret; + + len += ret; + ret = img_restore_pages(bprm, len); + if (ret < 0) + return ret; + + len += ret; + return len; + +} + +static int img_load_binary(struct linux_binprm * bprm, struct pt_regs * regs) +{ + int ret; + loff_t len = 0; + + ret = img_check_header(bprm->buf); + if (ret < 0) + return ret; + + len += ret; + ret = img_restore_regs(bprm, len, regs); + if (ret < 0) + return ret; + + 
len += ret; + ret = img_restore_mem(bprm, len); + if (ret < 0) + return ret; + + return 0; +} + +static struct linux_binfmt img_binfmt = { + .module = THIS_MODULE, + .load_binary = img_load_binary, +}; + +static __init int img_binfmt_init(void) +{ + return register_binfmt(&img_binfmt); +} + +static __exit void img_binfmt_exit(void) +{ + unregister_binfmt(&img_binfmt); +} + +module_init(img_binfmt_init); +module_exit(img_binfmt_exit); +MODULE_LICENSE("GPL"); -- 1.5.5.6 _______________________________________________ Containers mailing list Containers@xxxxxxxxxxxxxxxxxxxxxxxxxx https://lists.linux-foundation.org/mailman/listinfo/containers