Restoring the memory address space begins with nuking the existing one of the current process, and then reading the VMA state and contents. Call do_mmap_pgoff() for each VMA and then read in the data. Changelog[v14]: - Revert change to pr_debug(), back to cr_debug() - Compare saved 'vdso' field of mm_context with current value - Discard field 'h->parent' - Check whether calls to cr_hbuf_get() fail - Explicitly handle VDSO vma Changelog[v13]: - Avoid access to hh->vma_type after the header is freed - Test for no vma's in exit_mmap() before calling unmap_vmas() (or it may crash if restart fails after having removed all vma's) Changelog[v12]: - Replace obsolete cr_debug() with pr_debug() Changelog[v9]: - Introduce cr_ctx_checkpoint() for checkpoint-specific ctx setup Changelog[v7]: - Fix argument given to kunmap_atomic() in memory dump/restore Changelog[v6]: - Balance all calls to cr_hbuf_get() with matching cr_hbuf_put() (even though it's not really needed) Changelog[v5]: - Improve memory restore code (following Dave Hansen's comments) - Change dump format (and code) to allow chunks of <vaddrs, pages> instead of one long list of each - Memory restore now maps user pages explicitly to copy data into them, instead of reading directly to user space; got rid of mprotect_fixup() Changelog[v4]: - Use standard list_... 
for cr_pgarr Signed-off-by: Oren Laadan <orenl@xxxxxxxxxxxxxxx> Acked-by: Serge Hallyn <serue@xxxxxxxxxx> Signed-off-by: Dave Hansen <dave@xxxxxxxxxxxxxxxxxx> --- arch/x86/include/asm/checkpoint_hdr.h | 5 + arch/x86/mm/restart.c | 66 ++++++ checkpoint/Makefile | 2 +- checkpoint/checkpoint_arch.h | 1 + checkpoint/checkpoint_mem.h | 5 + checkpoint/restart.c | 47 ++++ checkpoint/rstr_mem.c | 402 +++++++++++++++++++++++++++++++++ checkpoint/rstr_task.c | 4 + include/linux/checkpoint.h | 10 +- mm/mmap.c | 2 +- 10 files changed, 539 insertions(+), 5 deletions(-) create mode 100644 checkpoint/rstr_mem.c diff --git a/arch/x86/include/asm/checkpoint_hdr.h b/arch/x86/include/asm/checkpoint_hdr.h index 54d3a41..e9eb40c 100644 --- a/arch/x86/include/asm/checkpoint_hdr.h +++ b/arch/x86/include/asm/checkpoint_hdr.h @@ -101,4 +101,9 @@ struct cr_hdr_mm_context { __u32 nldt; } __attribute__((aligned(8))); +#ifdef __KERNEL__ +/* misc prototypes from kernel (not defined elsewhere) */ +asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount); +#endif + #endif /* __ASM_X86_CKPT_HDR__H */ diff --git a/arch/x86/mm/restart.c b/arch/x86/mm/restart.c index 9353ae2..fca5cd8 100644 --- a/arch/x86/mm/restart.c +++ b/arch/x86/mm/restart.c @@ -10,10 +10,12 @@ #include <asm/desc.h> #include <asm/i387.h> +#include <asm/elf.h> #include <linux/checkpoint.h> #include <linux/checkpoint_hdr.h> + /* read the thread_struct into the current task */ int cr_read_thread(struct cr_ctx *ctx) { @@ -221,3 +223,67 @@ int cr_read_head_arch(struct cr_ctx *ctx) cr_hbuf_put(ctx, sizeof(*hh)); return ret; } + +int cr_read_mm_context(struct cr_ctx *ctx, struct mm_struct *mm) +{ + struct cr_hdr_mm_context *hh; + unsigned int n; + int ret; + + hh = cr_hbuf_get(ctx, sizeof(*hh)); + if (!hh) + return -ENOMEM; + + ret = cr_read_obj_type(ctx, hh, sizeof(*hh), CR_HDR_MM_CONTEXT); + if (ret < 0) + goto out; + + cr_debug("nldt %d vdso %#lx (%p)\n", + hh->nldt, (unsigned long) hh->vdso, 
mm->context.vdso); + + ret = -EINVAL; + if (hh->vdso != (unsigned long) mm->context.vdso) + goto out; + if (hh->ldt_entry_size != LDT_ENTRY_SIZE) + goto out; + + /* + * to utilize the syscall modify_ldt() we first convert the data + * in the checkpoint image from 'struct desc_struct' to 'struct + * user_desc' with reverse logic of include/asm/desc.h:fill_ldt() + */ + + for (n = 0; n < hh->nldt; n++) { + struct user_desc info; + struct desc_struct desc; + mm_segment_t old_fs; + + ret = cr_kread(ctx, &desc, LDT_ENTRY_SIZE); + if (ret < 0) + goto out; + + info.entry_number = n; + info.base_addr = desc.base0 | (desc.base1 << 16); + info.limit = desc.limit0; + info.seg_32bit = desc.d; + info.contents = desc.type >> 2; + info.read_exec_only = (desc.type >> 1) ^ 1; + info.limit_in_pages = desc.g; + info.seg_not_present = desc.p ^ 1; + info.useable = desc.avl; + + old_fs = get_fs(); + set_fs(get_ds()); + ret = sys_modify_ldt(1, (struct user_desc __user *) &info, + sizeof(info)); + set_fs(old_fs); + + if (ret < 0) + goto out; + } + + ret = 0; + out: + cr_hbuf_put(ctx, sizeof(*hh)); + return ret; +} diff --git a/checkpoint/Makefile b/checkpoint/Makefile index fddfada..c6cfd06 100644 --- a/checkpoint/Makefile +++ b/checkpoint/Makefile @@ -4,4 +4,4 @@ obj-$(CONFIG_CHECKPOINT) += sys.o checkpoint.o restart.o \ ckpt_task.o rstr_task.o \ - ckpt_mem.o + ckpt_mem.o rstr_mem.o diff --git a/checkpoint/checkpoint_arch.h b/checkpoint/checkpoint_arch.h index 5168765..e43b7fe 100644 --- a/checkpoint/checkpoint_arch.h +++ b/checkpoint/checkpoint_arch.h @@ -8,3 +8,4 @@ extern int cr_write_mm_context(struct cr_ctx *ctx, struct mm_struct *mm); extern int cr_read_head_arch(struct cr_ctx *ctx); extern int cr_read_thread(struct cr_ctx *ctx); extern int cr_read_cpu(struct cr_ctx *ctx); +extern int cr_read_mm_context(struct cr_ctx *ctx, struct mm_struct *mm); diff --git a/checkpoint/checkpoint_mem.h b/checkpoint/checkpoint_mem.h index 3e48bc4..de1d4c8 100644 --- a/checkpoint/checkpoint_mem.h +++ 
b/checkpoint/checkpoint_mem.h @@ -38,4 +38,9 @@ static inline int cr_pgarr_is_full(struct cr_pgarr *pgarr) return (pgarr->nr_used == CR_PGARR_TOTAL); } +static inline int cr_pgarr_nr_free(struct cr_pgarr *pgarr) +{ + return CR_PGARR_TOTAL - pgarr->nr_used; +} + #endif /* _CHECKPOINT_CKPT_MEM_H_ */ diff --git a/checkpoint/restart.c b/checkpoint/restart.c index 3d24796..9b62404 100644 --- a/checkpoint/restart.c +++ b/checkpoint/restart.c @@ -133,6 +133,44 @@ int cr_read_string(struct cr_ctx *ctx, char *str, int len) return ret; } +/** + * cr_read_fname - read a file name + * @ctx: checkpoint context + * @fname: buffer + * @n: buffer length + */ +int cr_read_fname(struct cr_ctx *ctx, char *fname, int flen) +{ + return cr_read_buf_type(ctx, fname, &flen, CR_HDR_FNAME); +} + +/** + * cr_read_open_fname - read a file name and open a file + * @ctx: checkpoint context + * @flags: file flags + * @mode: file mode + */ +struct file *cr_read_open_fname(struct cr_ctx *ctx, int flags, int mode) +{ + struct file *file; + char *fname; + int ret; + + fname = kmalloc(PATH_MAX, GFP_KERNEL); + if (!fname) + return ERR_PTR(-ENOMEM); + + ret = cr_read_fname(ctx, fname, PATH_MAX); + cr_debug("fname '%s' flags %#x mode %#x\n", fname, flags, mode); + if (ret >= 0) + file = filp_open(fname, flags, mode); + else + file = ERR_PTR(ret); + + kfree(fname); + return file; +} + /* read the checkpoint header */ static int cr_read_head(struct cr_ctx *ctx) { @@ -214,10 +252,19 @@ static int cr_read_tail(struct cr_ctx *ctx) return ret; } +/* setup restart-specific parts of ctx */ +static int cr_ctx_restart(struct cr_ctx *ctx) +{ + return 0; +} + int do_restart(struct cr_ctx *ctx, pid_t pid) { int ret; + ret = cr_ctx_restart(ctx); + if (ret < 0) + goto out; ret = cr_read_head(ctx); if (ret < 0) goto out; diff --git a/checkpoint/rstr_mem.c b/checkpoint/rstr_mem.c new file mode 100644 index 0000000..34024d7 --- /dev/null +++ b/checkpoint/rstr_mem.c @@ -0,0 +1,402 @@ +/* + * Restart memory contents + * + 
* Copyright (C) 2008-2009 Oren Laadan + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file COPYING in the main directory of the Linux + * distribution for more details. + */ + +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/fcntl.h> +#include <linux/file.h> +#include <linux/fs.h> +#include <linux/pagemap.h> +#include <linux/mm_types.h> +#include <linux/mman.h> +#include <linux/mm.h> +#include <linux/err.h> +#include <linux/checkpoint.h> +#include <linux/checkpoint_hdr.h> + +#include "checkpoint_arch.h" +#include "checkpoint_mem.h" + +/* + * Unlike checkpoint, restart is executed in the context of each restarting + * process: vma regions are restored via a call to mmap(), and the data is + * read into the address space of the current process. + */ + +/** + * cr_read_pages_vaddrs - read addresses of pages to page-array chain + * @ctx - restart context + * @nr_pages - number of address to read + */ +static int cr_read_pages_vaddrs(struct cr_ctx *ctx, unsigned long nr_pages) +{ + struct cr_pgarr *pgarr; + unsigned long *vaddrp; + int nr, ret; + + while (nr_pages) { + pgarr = cr_pgarr_current(ctx); + if (!pgarr) + return -ENOMEM; + nr = cr_pgarr_nr_free(pgarr); + if (nr > nr_pages) + nr = nr_pages; + vaddrp = &pgarr->vaddrs[pgarr->nr_used]; + ret = cr_kread(ctx, vaddrp, nr * sizeof(unsigned long)); + if (ret < 0) + return ret; + pgarr->nr_used += nr; + nr_pages -= nr; + } + return 0; +} + +static int cr_page_read(struct cr_ctx *ctx, struct page *page, char *buf) +{ + void *ptr; + int ret; + + ret = cr_kread(ctx, buf, PAGE_SIZE); + if (ret < 0) + return ret; + + ptr = kmap_atomic(page, KM_USER1); + memcpy(ptr, buf, PAGE_SIZE); + kunmap_atomic(ptr, KM_USER1); + + return 0; +} + +/** + * cr_read_pages_contents - read in data of pages in page-array chain + * @ctx - restart context + */ +static int cr_read_pages_contents(struct cr_ctx *ctx) +{ + struct mm_struct *mm = current->mm; + struct cr_pgarr 
*pgarr; + unsigned long *vaddrs; + char *buf; + int i, ret = 0; + + buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + down_read(&mm->mmap_sem); + list_for_each_entry_reverse(pgarr, &ctx->pgarr_list, list) { + vaddrs = pgarr->vaddrs; + for (i = 0; i < pgarr->nr_used; i++) { + struct page *page; + + ret = get_user_pages(current, mm, vaddrs[i], + 1, 1, 1, &page, NULL); + if (ret < 0) + goto out; + + ret = cr_page_read(ctx, page, buf); + page_cache_release(page); + + if (ret < 0) + goto out; + } + } + + out: + up_read(&mm->mmap_sem); + kfree(buf); + return 0; +} + +/** + * cr_read_private_vma_contents - restore contents of a VMA with private memory + * @ctx - restart context + * + * Reads a header that specifies how many pages will follow, then reads + * a list of virtual addresses into ctx->pgarr_list page-array chain, + * followed by the actual contents of the corresponding pages. Iterates + * these steps until reaching a header specifying "0" pages, which marks + * the end of the contents. 
+ */ +static int cr_read_private_vma_contents(struct cr_ctx *ctx) +{ + struct cr_hdr_pgarr *hh; + unsigned long nr_pages; + int ret; + + while (1) { + hh = cr_hbuf_get(ctx, sizeof(*hh)); + if (!hh) + return -ENOMEM; + ret = cr_read_obj_type(ctx, hh, sizeof(*hh), CR_HDR_PGARR); + if (ret < 0) { + cr_hbuf_put(ctx, sizeof(*hh)); + break; + } + + cr_debug("nr_pages %ld\n", (unsigned long) hh->nr_pages); + + nr_pages = hh->nr_pages; + cr_hbuf_put(ctx, sizeof(*hh)); + + if (!nr_pages) + break; + + ret = cr_read_pages_vaddrs(ctx, nr_pages); + if (ret < 0) + break; + ret = cr_read_pages_contents(ctx); + if (ret < 0) + break; + cr_pgarr_reset_all(ctx); + } + + return ret; +} + +/** + * cr_calc_map_prot_bits - convert vm_flags to mmap protection + * orig_vm_flags: source vm_flags + */ +static unsigned long cr_calc_map_prot_bits(unsigned long orig_vm_flags) +{ + unsigned long vm_prot = 0; + + if (orig_vm_flags & VM_READ) + vm_prot |= PROT_READ; + if (orig_vm_flags & VM_WRITE) + vm_prot |= PROT_WRITE; + if (orig_vm_flags & VM_EXEC) + vm_prot |= PROT_EXEC; + if (orig_vm_flags & PROT_SEM) /* only (?) 
with IPC-SHM */ + vm_prot |= PROT_SEM; + + return vm_prot; +} + +/** + * cr_calc_map_flags_bits - convert vm_flags to mmap flags + * orig_vm_flags: source vm_flags + */ +static unsigned long cr_calc_map_flags_bits(unsigned long orig_vm_flags) +{ + unsigned long vm_flags = 0; + + vm_flags = MAP_FIXED; + if (orig_vm_flags & VM_GROWSDOWN) + vm_flags |= MAP_GROWSDOWN; + if (orig_vm_flags & VM_DENYWRITE) + vm_flags |= MAP_DENYWRITE; + if (orig_vm_flags & VM_EXECUTABLE) + vm_flags |= MAP_EXECUTABLE; + if (orig_vm_flags & VM_MAYSHARE) + vm_flags |= MAP_SHARED; + else + vm_flags |= MAP_PRIVATE; + + return vm_flags; +} + +/** + * cr_read_vma - read vma data, recreate it and read contents + * @ctx: checkpoint context + * @mm: memory address space + * + * (see vma subtypes in checkpoint_hdr.h) + */ +static int cr_read_vma(struct cr_ctx *ctx, struct mm_struct *mm) +{ + struct cr_hdr_vma *hh; + unsigned long vm_size, vm_start, vm_flags, vm_prot, vm_pgoff; + unsigned long addr; + enum cr_vma_type vma_type; + struct file *file = NULL; + int ret; + + hh = cr_hbuf_get(ctx, sizeof(*hh)); + if (!hh) + return -ENOMEM; + ret = cr_read_obj_type(ctx, hh, sizeof(*hh), CR_HDR_VMA); + if (ret < 0) + goto err; + + cr_debug("vma %#lx-%#lx type %d\n", (unsigned long) hh->vm_start, + (unsigned long) hh->vm_end, (int) hh->vma_type); + + ret = -EINVAL; + if (hh->vm_end < hh->vm_start) + goto err; + + vm_start = hh->vm_start; + vm_pgoff = hh->vm_pgoff; + vm_size = hh->vm_end - hh->vm_start; + vm_prot = cr_calc_map_prot_bits(hh->vm_flags); + vm_flags = cr_calc_map_flags_bits(hh->vm_flags); + vma_type = hh->vma_type; + + cr_hbuf_put(ctx, sizeof(*hh)); + + switch (vma_type) { + + case CR_VMA_VDSO: /* special VDSO vma */ + if (vm_start != (unsigned long) mm->context.vdso) + return ret; + /* FIXME: uggh ... 
need to dig in */ + return 0; + + case CR_VMA_ANON: /* anonymous private mapping */ + if (vm_flags & VM_SHARED) + return ret; + /* + * vm_pgoff for anonymous mapping is the "global" page + * offset (namely from addr 0x0), so we force a zero + */ + vm_pgoff = 0; + break; + + case CR_VMA_FILE: /* private mapping from a file */ + if (vm_flags & VM_SHARED) + return ret; + /* + * for private mapping using 'read-only' is sufficient + */ + file = cr_read_open_fname(ctx, O_RDONLY, 0); + if (IS_ERR(file)) + return PTR_ERR(file); + break; + + default: + return ret; + + } + + down_write(&mm->mmap_sem); + addr = do_mmap_pgoff(file, vm_start, vm_size, + vm_prot, vm_flags, vm_pgoff); + up_write(&mm->mmap_sem); + cr_debug("size %#lx prot %#lx flag %#lx pgoff %#lx => %#lx\n", + vm_size, vm_prot, vm_flags, vm_pgoff, addr); + + /* the file (if opened) is now referenced by the vma */ + if (file) + filp_close(file, NULL); + + if (IS_ERR((void *) addr)) + return PTR_ERR((void *) addr); + + /* + * CR_VMA_ANON: read in memory as is + * CR_VMA_FILE: read in memory as is + * (more to follow ...) + */ + + switch (vma_type) { + case CR_VMA_ANON: + case CR_VMA_FILE: + /* standard case: read the data into the memory */ + ret = cr_read_private_vma_contents(ctx); + break; + default: + /* otherwise the compiler isn't happy ... 
*/ + ret = -EINVAL; + break; + } + + cr_debug("vma retval %d\n", ret); + return ret; + + err: + cr_hbuf_put(ctx, sizeof(*hh)); + return ret; +} + +static int cr_destroy_mm(struct mm_struct *mm) +{ + struct vm_area_struct *vmnext = mm->mmap; + struct vm_area_struct *vma; + const char *vma_name; + int ret; + + while (vmnext) { + vma = vmnext; + vmnext = vmnext->vm_next; + + /* VDSO will special treatment; for now - skip gracefully */ + vma_name = arch_vma_name(vma); + if (vma_name && !strcmp(vma_name, "[vdso]")) + continue; + + ret = do_munmap(mm, vma->vm_start, vma->vm_end-vma->vm_start); + if (ret < 0) { + cr_debug("c/r: restart failed do_munmap (%d)\n", ret); + return ret; + } + } + return 0; +} + +int cr_read_mm(struct cr_ctx *ctx) +{ + struct cr_hdr_mm *hh; + struct mm_struct *mm; + unsigned int nr; + int ret; + + hh = cr_hbuf_get(ctx, sizeof(*hh)); + if (!hh) + return -ENOMEM; + ret = cr_read_obj_type(ctx, hh, sizeof(*hh), CR_HDR_MM); + if (ret < 0) + goto out; + + cr_debug("map_count %d\n", hh->map_count); + + /* XXX need more sanity checks */ + + ret = -EINVAL; + if ((hh->start_code > hh->end_code) || + (hh->start_data > hh->end_data)) + goto out; + + mm = current->mm; + + /* point of no return -- destruct current mm */ + down_write(&mm->mmap_sem); + ret = cr_destroy_mm(mm); + if (ret < 0) { + up_write(&mm->mmap_sem); + goto out; + } + mm->start_code = hh->start_code; + mm->end_code = hh->end_code; + mm->start_data = hh->start_data; + mm->end_data = hh->end_data; + mm->start_brk = hh->start_brk; + mm->brk = hh->brk; + mm->start_stack = hh->start_stack; + mm->arg_start = hh->arg_start; + mm->arg_end = hh->arg_end; + mm->env_start = hh->env_start; + mm->env_end = hh->env_end; + up_write(&mm->mmap_sem); + + /* FIX: need also mm->flags */ + + for (nr = hh->map_count; nr; nr--) { + ret = cr_read_vma(ctx, mm); + if (ret < 0) + goto out; + } + + ret = cr_read_mm_context(ctx, mm); + out: + cr_hbuf_put(ctx, sizeof(*hh)); + return ret; +} diff --git 
a/checkpoint/rstr_task.c b/checkpoint/rstr_task.c index cd71abb..6aa9c6b 100644 --- a/checkpoint/rstr_task.c +++ b/checkpoint/rstr_task.c @@ -61,6 +61,10 @@ int cr_read_task(struct cr_ctx *ctx) cr_debug("ret %d\n", ret); if (ret < 0) goto out; + ret = cr_read_mm(ctx); + cr_debug("memory: ret %d\n", ret); + if (ret < 0) + goto out; ret = cr_read_thread(ctx); cr_debug("thread: ret %d\n", ret); if (ret < 0) diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h index 2c1bdaa..0628930 100644 --- a/include/linux/checkpoint.h +++ b/include/linux/checkpoint.h @@ -59,15 +59,19 @@ extern int cr_read_buf_type(struct cr_ctx *ctx, void *buf, int *len, int type); extern int cr_read_buffer(struct cr_ctx *ctx, void *buf, int *len); extern int cr_read_buffer_len(struct cr_ctx *ctx, void *buf, int len); extern int cr_read_string(struct cr_ctx *ctx, char *str, int len); +extern int cr_read_fname(struct cr_ctx *ctx, char *fname, int n); + +extern struct file *cr_read_open_fname(struct cr_ctx *ctx, + int flags, int mode); extern int cr_write_task(struct cr_ctx *ctx, struct task_struct *t); +extern int cr_write_mm(struct cr_ctx *ctx, struct task_struct *t); extern int cr_read_task(struct cr_ctx *ctx); +extern int cr_read_mm(struct cr_ctx *ctx); extern int do_checkpoint(struct cr_ctx *ctx, pid_t pid); -extern int cr_write_mm(struct cr_ctx *ctx, struct task_struct *t); - extern int do_restart(struct cr_ctx *ctx, pid_t pid); -extern int cr_read_mm(struct cr_ctx *ctx); + #define cr_debug(fmt, args...) \ pr_debug("[%d:c/r:%s] " fmt, task_pid_vnr(current), __func__, ## args) diff --git a/mm/mmap.c b/mm/mmap.c index 00ced3e..fb4df8f 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2106,7 +2106,7 @@ void exit_mmap(struct mm_struct *mm) tlb = tlb_gather_mmu(mm, 1); /* update_hiwater_rss(mm) here? but nobody should be looking */ /* Use -1 here to ensure all VMAs in the mm are unmapped */ - end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL); + end = vma ? 
unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL) : 0; vm_unacct_memory(nr_accounted); free_pgtables(tlb, vma, FIRST_USER_ADDRESS, 0); tlb_finish_mmu(tlb, 0, end); -- 1.5.4.3 _______________________________________________ Containers mailing list Containers@xxxxxxxxxxxxxxxxxxxxxxxxxx https://lists.linux-foundation.org/mailman/listinfo/containers