On Fri, Jul 15, 2011 at 05:47:44PM +0400, Pavel Emelyanov wrote: > An image read from file contains task's registers and information > about its VM. Later this image can be execve-ed causing recreation > of the previously read task state. > > The file format is my own, very simple. Introduced to make the code > as simple as possible. Better file format (if any) is to be discussed. I think file format should be per-binfmt, similar to core dump. So it will be ELF with ELF binary. Core dumper code can be reused in some way. > Signed-off-by: Pavel Emelyanov <xemul@xxxxxxxxxxxxx> > > --- > fs/proc/Kconfig | 8 + > fs/proc/Makefile | 1 + > fs/proc/base.c | 3 + > fs/proc/img_dump.c | 397 ++++++++++++++++++++++++++++++++++++++++++++ > include/linux/binfmt_img.h | 87 ++++++++++ > include/linux/proc_fs.h | 2 + > 6 files changed, 498 insertions(+), 0 deletions(-) > create mode 100644 fs/proc/img_dump.c > create mode 100644 include/linux/binfmt_img.h > > diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig > index 15af622..c64bf75 100644 > --- a/fs/proc/Kconfig > +++ b/fs/proc/Kconfig > @@ -67,3 +67,11 @@ config PROC_PAGE_MONITOR > /proc/pid/smaps, /proc/pid/clear_refs, /proc/pid/pagemap, > /proc/kpagecount, and /proc/kpageflags. Disabling these > interfaces will reduce the size of the kernel by approximately 4kb. > + > +config PROC_IMG > + default y > + depends on PROC_FS depends on X86_64 ? 
>+ bool "Enable /proc/<pid>/dump file" > + help > + Say Y here if you want to be able to produce checkpoint-restore images > + for tasks via proc > diff --git a/fs/proc/Makefile b/fs/proc/Makefile > index df434c5..3a59cb1 100644 > --- a/fs/proc/Makefile > +++ b/fs/proc/Makefile > @@ -27,3 +27,4 @@ proc-$(CONFIG_PROC_VMCORE) += vmcore.o > proc-$(CONFIG_PROC_DEVICETREE) += proc_devtree.o > proc-$(CONFIG_PRINTK) += kmsg.o > proc-$(CONFIG_PROC_PAGE_MONITOR) += page.o > +proc-$(CONFIG_PROC_IMG) += img_dump.o > diff --git a/fs/proc/base.c b/fs/proc/base.c > index 633af12..c01438f 100644 > --- a/fs/proc/base.c > +++ b/fs/proc/base.c > @@ -3044,6 +3044,9 @@ static const struct pid_entry tgid_base_stuff[] = { > #endif > INF("cmdline", S_IRUGO, proc_pid_cmdline), > ONE("stat", S_IRUGO, proc_tgid_stat), > +#ifdef CONFIG_PROC_IMG > + REG("dump", S_IRUSR|S_IWUSR, proc_pid_dump_operations), > +#endif Writable? > ONE("statm", S_IRUGO, proc_pid_statm), > REG("maps", S_IRUGO, proc_maps_operations), > #ifdef CONFIG_NUMA > diff --git a/fs/proc/img_dump.c b/fs/proc/img_dump.c > new file mode 100644 > index 0000000..7fa52ef > --- /dev/null > +++ b/fs/proc/img_dump.c > @@ -0,0 +1,397 @@ > +#include <linux/proc_fs.h> > +#include <linux/sched.h> > +#include <linux/uaccess.h> > +#include <linux/binfmt_img.h> > +#include <linux/mm.h> > +#include <linux/mman.h> > +#include <linux/highmem.h> > +#include <linux/types.h> > +#include "internal.h" > + > +static int img_dump_buffer(char __user *ubuf, size_t size, void *buf, int len, int pos) > +{ > + int ret; > + static size_t dumped = 0; > + > + len -= pos; > + if (len > size) > + len = size; > + > + ret = copy_to_user(ubuf, buf + pos, len); > + if (ret) > + return -EFAULT; > + > + dumped += len; > + return len; > +} > + > +static int img_dump_header(char __user *buf, size_t size, int pos) > +{ > + struct binfmt_img_header hdr; > + > + hdr.magic = BINFMT_IMG_MAGIC; > + hdr.version = BINFMT_IMG_VERS_0; > + > + return img_dump_buffer(buf, size, 
&hdr, sizeof(hdr), pos); > +} > + > +static __u16 encode_segment(unsigned short seg) > +{ > + if (seg == 0) > + return CKPT_X86_SEG_NULL; > + BUG_ON((seg & 3) != 3); > + > + if (seg == __USER_CS) > + return CKPT_X86_SEG_USER64_CS; > + if (seg == __USER_DS) > + return CKPT_X86_SEG_USER64_DS; > +#ifdef CONFIG_COMPAT > + if (seg == __USER32_CS) > + return CKPT_X86_SEG_USER32_CS; > + if (seg == __USER32_DS) > + return CKPT_X86_SEG_USER32_DS; > +#endif > + > + if (seg & 4) > + return CKPT_X86_SEG_LDT | (seg >> 3); > + > + seg >>= 3; > + if (GDT_ENTRY_TLS_MIN <= seg && seg <= GDT_ENTRY_TLS_MAX) > + return CKPT_X86_SEG_TLS | (seg - GDT_ENTRY_TLS_MIN); > + > + printk(KERN_ERR "c/r: (decode) bad segment %#hx\n", seg); > + BUG(); > +} > + > +static __u64 encode_tls(struct desc_struct *d) > +{ > + return ((__u64)d->a << 32) + d->b; > +} > + > +static int img_dump_regs(struct task_struct *p, char __user *buf, size_t size, int pos) > +{ > + struct binfmt_regs_image regi; > + struct pt_regs *regs; > + int i; > + > + regs = task_pt_regs(p); > + > + regi.r15 = regs->r15; > + regi.r14 = regs->r14; > + regi.r13 = regs->r13; > + regi.r12 = regs->r12; > + regi.r11 = regs->r11; > + regi.r10 = regs->r10; > + regi.r9 = regs->r9; > + regi.r8 = regs->r8; > + regi.ax = regs->ax; > + regi.orig_ax = regs->orig_ax; > + regi.bx = regs->bx; > + regi.cx = regs->cx; > + regi.dx = regs->dx; > + regi.si = regs->si; > + regi.di = regs->di; > + regi.ip = regs->ip; > + regi.flags = regs->flags; > + regi.bp = regs->bp; > + regi.sp = regs->sp; > + > + /* segments */ > + regi.gsindex = encode_segment(p->thread.gsindex); > + regi.fsindex = encode_segment(p->thread.fsindex); > + regi.cs = encode_segment(regs->cs); > + regi.ss = encode_segment(regs->ss); > + regi.ds = encode_segment(p->thread.ds); > + regi.es = encode_segment(p->thread.es); > + > + BUILD_BUG_ON(GDT_ENTRY_TLS_ENTRIES != CKPT_TLS_ENTRIES); > + for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++) > + regi.tls[i] = encode_tls(&p->thread.tls_array[i]); > 
+ > + if (p->thread.gsindex) > + regi.gs = 0; > + else > + regi.gs = p->thread.gs; > + > + if (p->thread.fsindex) > + regi.fs = 0; > + else > + regi.fs = p->thread.fs; > + > + return img_dump_buffer(buf, size, &regi, sizeof(regi), pos); > +} > + > +static int img_dump_mm(struct mm_struct *mm, char __user *buf, size_t size, int pos) > +{ > + struct binfmt_mm_image mmi; > + > + mmi.flags = mm->flags; > + mmi.def_flags = mm->def_flags; > + mmi.start_code = mm->start_code; > + mmi.end_code = mm->end_code; > + mmi.start_data = mm->start_data; > + mmi.end_data = mm->end_data; > + mmi.start_brk = mm->start_brk; > + mmi.brk = mm->brk; > + mmi.start_stack = mm->start_stack; > + mmi.arg_start = mm->arg_start; > + mmi.arg_end = mm->arg_end; > + mmi.env_start = mm->env_start; > + mmi.env_end = mm->env_end; > + mmi.exe_fd = 0; > + > + return img_dump_buffer(buf, size, &mmi, sizeof(mmi), pos); > +} > + > +static int img_dump_vma(struct vm_area_struct *vma, char __user *buf, size_t size, int pos) > +{ > + struct binfmt_vma_image vmai; > + > + if (vma == NULL) { > + memset(&vmai, 0, sizeof(vmai)); > + goto dumpit; > + } > + > + printk("Dumping vma %016lx-%016lx %p/%p\n", vma->vm_start, vma->vm_end, vma, vma->vm_mm); > + > + vmai.fd = 0; > + vmai.prot = 0; > + if (vma->vm_flags & VM_READ) > + vmai.prot |= PROT_READ; > + if (vma->vm_flags & VM_WRITE) > + vmai.prot |= PROT_WRITE; > + if (vma->vm_flags & VM_EXEC) > + vmai.prot |= PROT_EXEC; > + > + vmai.flags = 0; > + if (vma->vm_file == NULL) > + vmai.flags |= MAP_ANONYMOUS; > + if (vma->vm_flags & VM_MAYSHARE) > + vmai.flags |= MAP_SHARED; > + else > + vmai.flags |= MAP_PRIVATE; > + > + vmai.start = vma->vm_start; > + vmai.end = vma->vm_end; > + vmai.pgoff = vma->vm_pgoff; > + > +dumpit: > + return img_dump_buffer(buf, size, &vmai, sizeof(vmai), pos); > +} > + > +static int img_dump_page(unsigned long addr, void *data, char __user *buf, size_t size, int pos) > +{ > + struct binfmt_page_image pgi; > + int ret = 0, tmp; > + > + pgi.vaddr 
= addr; > + > + if (pos < sizeof(pgi)) { > + tmp = img_dump_buffer(buf, size, &pgi, sizeof(pgi), pos); > + if (tmp < 0) > + return tmp; > + > + ret = tmp; > + if (size <= ret) > + return ret; > + > + buf += ret; > + size -= ret; > + pos = 0; > + } else > + pos -= sizeof(pgi); > + > + tmp = img_dump_buffer(buf, size, data, PAGE_SIZE, pos); > + if (tmp < 0) > + return tmp; > + > + return ret + tmp; > +} > + > +static inline int is_private_vma(struct vm_area_struct *vma) > +{ > + if (vma->vm_file == NULL) > + return 1; > + if (!(vma->vm_flags & VM_SHARED)) > + return 1; > + return 0; > +} > + > +static ssize_t do_produce_dump(struct task_struct *p, char __user *buf, > + size_t size, loff_t *ppos) > +{ > + size_t img_pos = 0, img_ppos; > + size_t produced = 0; > + int len; > + loff_t pos = *ppos; > + struct mm_struct *mm; > + struct vm_area_struct *vma; > + > +#define move_pos(); do { \ > + buf += len; \ > + produced += len;\ > + size -= len; \ > + pos += len; \ > + } while (0) > + > +#define seek_pos(__size); do { \ > + img_ppos = img_pos; \ > + img_pos += (__size); \ > + } while (0) > + > + /* header */ > + seek_pos(sizeof(struct binfmt_img_header)); > + if (pos < img_pos) { > + len = img_dump_header(buf, size, pos - img_ppos); > + if (len < 0) > + goto err; > + > + move_pos(); > + if (size == 0) > + goto out; > + } > + > + /* registers */ > + seek_pos(sizeof(struct binfmt_regs_image)); > + if (pos < img_pos) { > + len = img_dump_regs(p, buf, size, pos - img_ppos); > + if (len < 0) > + goto err; > + > + move_pos(); > + if (size == 0) > + goto out; > + } > + > + /* memory */ > + mm = get_task_mm(p); > + if (mm == NULL) > + return -EACCES; > + > + down_read(&mm->mmap_sem); > + > + seek_pos(sizeof(struct binfmt_mm_image)); > + if (pos < img_pos) { > + len = img_dump_mm(mm, buf, size, pos - img_ppos); > + if (len < 0) > + goto err_mm; > + > + move_pos(); > + if (size == 0) > + goto out_mm; > + } > + > + vma = mm->mmap; > + while (1) { > + seek_pos(sizeof(struct 
binfmt_vma_image)); > + if (pos < img_pos) { > + len = img_dump_vma(vma, buf, size, pos - img_ppos); > + if (len < 0) > + goto err_mm; > + > + move_pos(); > + if (size == 0) > + goto out_mm; > + } > + > + if (vma == NULL) > + break; > + > + vma = vma->vm_next; > + } > + > + for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) { > + /* slow and stupid */ > + unsigned long addr; > + struct page *page; > + void *pg_data; > + > + if (!is_private_vma(vma)) > + continue; > + > + for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) { > + page = follow_page(vma, addr, FOLL_FORCE | FOLL_DUMP | FOLL_GET); > + if (page == NULL) > + continue; > + if (IS_ERR(page)) /* huh? */ > + continue; > + > + seek_pos(sizeof(struct binfmt_page_image) + PAGE_SIZE); > + if (pos < img_pos) { > + pg_data = kmap(page); > + len = img_dump_page(addr, pg_data, buf, size, pos - img_ppos); > + kunmap(page); > + > + if (len < 0) { > + put_page(page); > + goto err_mm; > + } > + > + move_pos(); > + if (size == 0) { > + put_page(page); > + goto out_mm; > + } > + } > + > + put_page(page); > + } > + } > + > + seek_pos(sizeof(struct binfmt_page_image)); > + if (pos < img_pos) { > + struct binfmt_page_image zero; > + > + memset(&zero, 0, sizeof(zero)); > + len = img_dump_buffer(buf, size, &zero, sizeof(zero), pos - img_ppos); > + if (len < 0) > + goto err; > + > + move_pos(); > + } > + > +out_mm: > + up_read(&mm->mmap_sem); > + mmput(mm); > +out: > + *ppos = pos; > + return produced; > + > +err_mm: > + up_read(&mm->mmap_sem); > + mmput(mm); > +err: > + return len; > +} > + > +static ssize_t img_dump_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) > +{ > + struct task_struct *p; > + > + p = get_proc_task(file->f_dentry->d_inode); > + if (p == NULL) > + return -ESRCH; > + > + if (!(p->state & TASK_STOPPED)) { > + put_task_struct(p); > + return -EINVAL; > + } > + > + return do_produce_dump(p, buf, size, ppos); > +} > + > +static int img_dump_open(struct inode *inode, struct 
file *filp) > +{ > + return 0; > +} > + > +static int img_dump_release(struct inode *inode, struct file *filp) > +{ > + return 0; > +} > + > +const struct file_operations proc_pid_dump_operations = { > + .open = img_dump_open, > + .read = img_dump_read, > + .release = img_dump_release, > +}; > diff --git a/include/linux/binfmt_img.h b/include/linux/binfmt_img.h > new file mode 100644 > index 0000000..a4293af > --- /dev/null > +++ b/include/linux/binfmt_img.h > @@ -0,0 +1,87 @@ > +#ifndef __BINFMT_IMG_H__ > +#define __BINFMT_IMG_H__ > + > +#include <linux/types.h> > + > +struct binfmt_img_header { > + __u32 magic; > + __u32 version; > +}; > + > +#define CKPT_TLS_ENTRIES 3 > + > +struct binfmt_regs_image { > + __u64 r15; > + __u64 r14; > + __u64 r13; > + __u64 r12; > + __u64 r11; > + __u64 r10; > + __u64 r9; > + __u64 r8; > + __u64 ax; > + __u64 orig_ax; > + __u64 bx; > + __u64 cx; > + __u64 dx; > + __u64 si; > + __u64 di; > + __u64 ip; > + __u64 flags; > + __u64 bp; > + __u64 sp; > + > + __u64 gs; > + __u64 fs; > + __u64 tls[CKPT_TLS_ENTRIES]; > + __u16 gsindex; > + __u16 fsindex; > + __u16 cs; > + __u16 ss; > + __u16 ds; > + __u16 es; > +}; > + > +#define CKPT_X86_SEG_NULL 0 > +#define CKPT_X86_SEG_USER32_CS 1 > +#define CKPT_X86_SEG_USER32_DS 2 > +#define CKPT_X86_SEG_USER64_CS 3 > +#define CKPT_X86_SEG_USER64_DS 4 > +#define CKPT_X86_SEG_TLS 0x4000 > +#define CKPT_X86_SEG_LDT 0x8000 > + > +struct binfmt_mm_image { > + __u64 flags; > + __u64 def_flags; > + __u64 start_code; > + __u64 end_code; > + __u64 start_data; > + __u64 end_data; > + __u64 start_brk; > + __u64 brk; > + __u64 start_stack; > + __u64 arg_start; > + __u64 arg_end; > + __u64 env_start; > + __u64 env_end; > + __u32 exe_fd; > +}; > + > +struct binfmt_vma_image { > + __u32 prot; > + __u32 flags; > + __u32 pad; > + __u32 fd; > + __u64 start; > + __u64 end; > + __u64 pgoff; > +}; > + > +struct binfmt_page_image { > + __u64 vaddr; > +}; > + > +#define BINFMT_IMG_MAGIC 0xa75b8d43 > +#define 
BINFMT_IMG_VERS_0 0x00000100 > + > +#endif > diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h > index c779c74..686b374 100644 > --- a/include/linux/proc_fs.h > +++ b/include/linux/proc_fs.h > @@ -102,6 +102,8 @@ struct vmcore { > > #ifdef CONFIG_PROC_FS > > +extern const struct file_operations proc_pid_dump_operations; > + > extern void proc_root_init(void); > > void proc_flush_task(struct task_struct *task); > -- > 1.5.5.6 > _______________________________________________ > Containers mailing list > Containers@xxxxxxxxxxxxxxxxxxxxxxxxxx > https://lists.linux-foundation.org/mailman/listinfo/containers -- Kirill A. Shutemov _______________________________________________ Containers mailing list Containers@xxxxxxxxxxxxxxxxxxxxxxxxxx https://lists.linux-foundation.org/mailman/listinfo/containers