Serge E. Hallyn wrote:
> Well, here is my current attempt at properly handling vdso for
> the x86 and s390 c/r code. I was going to test with gettimeofday,
> but x86 doesn't use vdso for that, and I'm still recompiling glibc
> on s390 to exploit vdso for it. So what I can tell so far is that
> CONFIG_COMPAT_VDSO=n is now safe on x86 - the kernel vdso segment
> is re-mapped from the kernel into the checkpointed location, so
> the fact that the task was started with a random vdso base doesn't
> matter.
>
> Once I get a proper testcase, I intend to show that checkpointing
> a testcase which does gettimeofday(), waiting 10 seconds, and
> then restarting it, will end up with bad __vdso_gettimeofday()
> results without this patch, and good ones with it.

It would be helpful if you split the patch into the non-c/r changes
(i.e., adding a parameter to request a mapping address for the vdso),
followed by the c/r changes per architecture.

Also, I noticed that it may be more complicated. For example, the
function get_gate_vma() (at least on x86) tests the value of
context.vdso against some constant to decide on its return value. I
don't know whether this will break if the vdso of a task ends up
being something that the system didn't expect it to be?
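For reference, the 32-bit x86 case I have in mind looks roughly like
this (quoting arch/x86/vdso/vdso32-setup.c from memory, so the exact
form may differ in your tree):

struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
{
        struct mm_struct *mm = tsk->mm;

        /* Check to see if this task was created in compat vdso mode */
        if (mm && mm->context.vdso == (void *)VDSO_HIGH_BASE)
                return &gate_vma;

        return NULL;
}

A task whose vdso gets restored at anything other than VDSO_HIGH_BASE
stops matching that check, so it is worth verifying whether anything
that relies on the gate vma (core dumps, /proc/<pid>/maps) cares.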
Oren.

>
> -serge
>
> From f79593f4b17bef93b067584c6222fa9e510ab5a7 Mon Sep 17 00:00:00 2001
> From: Serge E. Hallyn <serue@xxxxxxxxxx>
> Date: Tue, 24 Mar 2009 15:07:40 -0400
> Subject: [PATCH 1/1] cr: remap vdso at original address
>
> Remap the vdso at the original address in the case of sys_restart.
>
> sys_checkpoint does not save away the vdso vma. This is done
> in the arch-independent code.
>
> sys_restart uses arch_setup_additional_pages() to request mapping
> vdso at the original (checkpointed) location. This is being
> done in the arch-dependent code.
>
> arch_setup_additional_pages() no longer takes the bprm argument,
> which was not used. Instead it takes an unsigned long reqaddr,
> which is a requested address. If 0, then we assume the usual
> calculation for vdso base is performed. Otherwise, we ask for
> the vdso to be installed at the original location (reqaddr), and
> return -EINVAL if that was not possible.
>
> Signed-off-by: Serge E. Hallyn <serue@xxxxxxxxxx>
> ---
>  arch/powerpc/include/asm/elf.h     |    3 +--
>  arch/powerpc/kernel/vdso.c         |    8 +++++++-
>  arch/s390/include/asm/elf.h        |    2 +-
>  arch/s390/kernel/vdso.c            |    8 +++++++-
>  arch/s390/mm/checkpoint.c          |    1 +
>  arch/s390/mm/restart.c             |    9 ++++++++-
>  arch/sh/include/asm/elf.h          |    3 +--
>  arch/sh/kernel/vsyscall/vsyscall.c |    5 ++++-
>  arch/x86/include/asm/elf.h         |    5 ++---
>  arch/x86/mm/restart.c              |   14 ++++++++++++--
>  arch/x86/vdso/vdso32-setup.c       |    8 ++++++--
>  arch/x86/vdso/vma.c                |   11 +++++++++--
>  checkpoint/ckpt_mem.c              |   10 ++++++++++
>  fs/binfmt_elf.c                    |    2 +-
>  14 files changed, 70 insertions(+), 19 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/elf.h b/arch/powerpc/include/asm/elf.h
> index cd46f02..133e108 100644
> --- a/arch/powerpc/include/asm/elf.h
> +++ b/arch/powerpc/include/asm/elf.h
> @@ -266,8 +266,7 @@ extern int ucache_bsize;
>  /* vDSO has arch_setup_additional_pages */
>  #define ARCH_HAS_SETUP_ADDITIONAL_PAGES
>  struct linux_binprm;
> -extern int arch_setup_additional_pages(struct linux_binprm *bprm,
> -                                       int uses_interp);
> +extern int arch_setup_additional_pages(int uses_interp, unsigned long reqaddr);
>  #define VDSO_AUX_ENT(a,b) NEW_AUX_ENT(a,b);
>
>  #endif /* __KERNEL__ */
> diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c
> index ad06d5c..e91310c 100644
> --- a/arch/powerpc/kernel/vdso.c
> +++ b/arch/powerpc/kernel/vdso.c
> @@ -184,7 +184,7 @@ static void dump_vdso_pages(struct vm_area_struct * vma)
>   * This is called from binfmt_elf, we create the special vma for the
>   * vDSO and insert it into the mm struct tree
>   */
> -int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
> +int arch_setup_additional_pages(int uses_interp, unsigned long reqaddr)
>  {
>          struct mm_struct *mm = current->mm;
>          struct page **vdso_pagelist;
> @@ -210,6 +210,8 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
>          vdso_pages = vdso32_pages;
>          vdso_base = VDSO32_MBASE;
>  #endif
> +        if (reqaddr)
> +                vdso_base = reqaddr;
>
>          current->mm->context.vdso_base = 0;
>
> @@ -233,6 +235,10 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
>                  rc = vdso_base;
>                  goto fail_mmapsem;
>          }
> +        if (reqaddr && reqaddr != vdso_base) {
> +                rc = -EINVAL;
> +                goto fail_mmapsem;
> +        }
>
>          /*
>           * our vma flags don't have VM_WRITE so by default, the process isn't
> diff --git a/arch/s390/include/asm/elf.h b/arch/s390/include/asm/elf.h
> index 74d0bbb..49aaab8 100644
> --- a/arch/s390/include/asm/elf.h
> +++ b/arch/s390/include/asm/elf.h
> @@ -205,6 +205,6 @@ do { \
>  struct linux_binprm;
>
>  #define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1
> -int arch_setup_additional_pages(struct linux_binprm *, int);
> +int arch_setup_additional_pages(int, unsigned long);
>
>  #endif
> diff --git a/arch/s390/kernel/vdso.c b/arch/s390/kernel/vdso.c
> index 690e178..e50ac7f 100644
> --- a/arch/s390/kernel/vdso.c
> +++ b/arch/s390/kernel/vdso.c
> @@ -184,7 +184,7 @@ static void vdso_init_cr5(void)
>   * This is called from binfmt_elf, we create the special vma for the
>   * vDSO and insert it into the mm struct tree
>   */
> -int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
> +int arch_setup_additional_pages(int uses_interp, unsigned long reqaddr)
>  {
>          struct mm_struct *mm = current->mm;
>          struct page **vdso_pagelist;
> @@ -214,6 +214,8 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
>          vdso_pagelist = vdso32_pagelist;
>          vdso_pages = vdso32_pages;
>  #endif
> +        if (reqaddr)
> +                vdso_base = reqaddr;
>
>          /*
>           * vDSO has a problem and was disabled, just don't "enable" it for
> @@ -236,6 +238,10 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
>                  rc = vdso_base;
>                  goto out_up;
>          }
> +        if (reqaddr && vdso_base != reqaddr) {
> +                rc = -EINVAL;
> +                goto out_up;
> +        }
>
>          /*
>           * our vma flags don't have VM_WRITE so by default, the process
> diff --git a/arch/s390/mm/checkpoint.c b/arch/s390/mm/checkpoint.c
> index c4e3719..1c79418 100644
> --- a/arch/s390/mm/checkpoint.c
> +++ b/arch/s390/mm/checkpoint.c
> @@ -63,6 +63,7 @@ void cr_s390_mm(int op, struct cr_hdr_mm_context *hh, struct mm_struct *mm)
>          CR_COPY(op, hh->alloc_pgste, mm->context.alloc_pgste);
>          CR_COPY(op, hh->asce_bits, mm->context.asce_bits);
>          CR_COPY(op, hh->asce_limit, mm->context.asce_limit);
> +        CR_COPY(op, hh->vdso_base, mm->context.vdso_base);
>  }
>
>  int cr_write_thread(struct cr_ctx *ctx, struct task_struct *t)
> diff --git a/arch/s390/mm/restart.c b/arch/s390/mm/restart.c
> index 6892a2a..401f862 100644
> --- a/arch/s390/mm/restart.c
> +++ b/arch/s390/mm/restart.c
> @@ -13,6 +13,7 @@
>  #include <linux/kernel.h>
>  #include <asm/system.h>
>  #include <asm/pgtable.h>
> +#include <asm/elf.h>
>
>  #include "checkpoint_s390.h"
>
> @@ -71,7 +72,13 @@ int cr_read_mm_context(struct cr_ctx *ctx, struct mm_struct *mm)
>          }
>
>          cr_s390_mm(CR_RST, hh, mm);
> -        ret = 0;
> +        if (mm->context.vdso_base) {
> +                ret = arch_setup_additional_pages(1, mm->context.vdso_base);
> +                printk(KERN_NOTICE "%s: resetting vdso to %lx ret %d\n",
> +                        __func__, mm->context.vdso_base, ret);
> +        }
> +        else
> +                ret = 0;
>  out:
>          cr_hbuf_put(ctx, sizeof(*hh));
>          return ret;
> diff --git a/arch/sh/include/asm/elf.h b/arch/sh/include/asm/elf.h
> index ccb1d93..de173dc 100644
> --- a/arch/sh/include/asm/elf.h
> +++ b/arch/sh/include/asm/elf.h
> @@ -201,8 +201,7 @@ do { \
>  /* vDSO has arch_setup_additional_pages */
>  #define ARCH_HAS_SETUP_ADDITIONAL_PAGES
>  struct linux_binprm;
> -extern int arch_setup_additional_pages(struct linux_binprm *bprm,
> -                                       int uses_interp);
> +extern int arch_setup_additional_pages(int uses_interp, unsigned long reqaddr);
>
>  extern unsigned int vdso_enabled;
>  extern void __kernel_vsyscall;
> diff --git a/arch/sh/kernel/vsyscall/vsyscall.c b/arch/sh/kernel/vsyscall/vsyscall.c
> index 3f7e415..232b415 100644
> --- a/arch/sh/kernel/vsyscall/vsyscall.c
> +++ b/arch/sh/kernel/vsyscall/vsyscall.c
> @@ -59,12 +59,15 @@ int __init vsyscall_init(void)
>  }
>
>  /* Setup a VMA at program startup for the vsyscall page */
> -int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
> +int arch_setup_additional_pages(int uses_interp, unsigned long reqaddr)
>  {
>          struct mm_struct *mm = current->mm;
>          unsigned long addr;
>          int ret;
>
> +        if (reqaddr) /* no restart support for sh yet */
> +                return -EINVAL;
> +
>          down_write(&mm->mmap_sem);
>          addr = get_unmapped_area(NULL, 0, PAGE_SIZE, 0, 0);
>          if (IS_ERR_VALUE(addr)) {
> diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
> index f51a3dd..cb16bf6 100644
> --- a/arch/x86/include/asm/elf.h
> +++ b/arch/x86/include/asm/elf.h
> @@ -324,10 +324,9 @@ do { \
>  struct linux_binprm;
>
>  #define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1
> -extern int arch_setup_additional_pages(struct linux_binprm *bprm,
> -                                       int uses_interp);
> +extern int arch_setup_additional_pages(int uses_interp, unsigned long reqaddr);
>
> -extern int syscall32_setup_pages(struct linux_binprm *, int exstack);
> +extern int syscall32_setup_pages(int exstack, unsigned long reqaddr);
>  #define compat_arch_setup_additional_pages syscall32_setup_pages
>
>  extern unsigned long arch_randomize_brk(struct mm_struct *mm);
> diff --git a/arch/x86/mm/restart.c b/arch/x86/mm/restart.c
> index c76d2b2..f5eedf6 100644
> --- a/arch/x86/mm/restart.c
> +++ b/arch/x86/mm/restart.c
> @@ -14,6 +14,8 @@
>  #include <linux/checkpoint.h>
>  #include <linux/checkpoint_hdr.h>
>
> +#include <asm/elf.h>
> +
>  /* read the thread_struct into the current task */
>  int cr_read_thread(struct cr_ctx *ctx)
>  {
> @@ -240,8 +242,16 @@ int cr_read_mm_context(struct cr_ctx *ctx, struct mm_struct *mm)
>                   hh->nldt, (unsigned long) hh->vdso, mm->context.vdso);
>
>          ret = -EINVAL;
> -        if (hh->vdso != (unsigned long) mm->context.vdso)
> -                goto out;
> +        printk(KERN_NOTICE "%s: setting vdso from %p to %lx\n",
> +                __func__, mm->context.vdso, (unsigned long)hh->vdso);
> +        mm->context.vdso = (void *) hh->vdso;
> +        if (hh->vdso) {
> +                ret = arch_setup_additional_pages(1, hh->vdso);
> +                printk(KERN_NOTICE "%s: setting vdso to %p, ret %d\n",
> +                        __func__, mm->context.vdso, ret);
> +                if (ret)
> +                        goto out;
> +        }
>          if (hh->ldt_entry_size != LDT_ENTRY_SIZE)
>                  goto out;
>
> diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c
> index 1241f11..7d4e99b 100644
> --- a/arch/x86/vdso/vdso32-setup.c
> +++ b/arch/x86/vdso/vdso32-setup.c
> @@ -310,7 +310,7 @@ int __init sysenter_setup(void)
>  }
>
>  /* Setup a VMA at program startup for the vsyscall page */
> -int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
> +int arch_setup_additional_pages(int uses_interp, unsigned long reqaddr)
>  {
>          struct mm_struct *mm = current->mm;
>          unsigned long addr;
> @@ -331,11 +331,15 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
>          if (compat)
>                  addr = VDSO_HIGH_BASE;
>          else {
> -                addr = get_unmapped_area(NULL, 0, PAGE_SIZE, 0, 0);
> +                addr = get_unmapped_area(NULL, reqaddr, PAGE_SIZE, 0, 0);
>                  if (IS_ERR_VALUE(addr)) {
>                          ret = addr;
>                          goto up_fail;
>                  }
> +                if (reqaddr && addr != reqaddr) {
> +                        ret = -EINVAL;
> +                        goto up_fail;
> +                }
>          }
>
>          if (compat_uses_vma || !compat) {
> diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c
> index 9c98cc6..098b077 100644
> --- a/arch/x86/vdso/vma.c
> +++ b/arch/x86/vdso/vma.c
> @@ -98,7 +98,7 @@ static unsigned long vdso_addr(unsigned long start, unsigned len)
>
>  /* Setup a VMA at program startup for the vsyscall page.
>     Not called for compat tasks */
> -int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
> +int arch_setup_additional_pages(int uses_interp, unsigned long reqaddr)
>  {
>          struct mm_struct *mm = current->mm;
>          unsigned long addr;
> @@ -108,12 +108,19 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
>                  return 0;
>
>          down_write(&mm->mmap_sem);
> -        addr = vdso_addr(mm->start_stack, vdso_size);
> +        if (reqaddr)
> +                addr = reqaddr;
> +        else
> +                addr = vdso_addr(mm->start_stack, vdso_size);
>          addr = get_unmapped_area(NULL, addr, vdso_size, 0, 0);
>          if (IS_ERR_VALUE(addr)) {
>                  ret = addr;
>                  goto up_fail;
>          }
> +        if (reqaddr && addr != reqaddr) {
> +                ret = -EINVAL;
> +                goto up_fail;
> +        }
>
>          ret = install_special_mapping(mm, addr, vdso_size,
>                                        VM_READ|VM_EXEC|
> diff --git a/checkpoint/ckpt_mem.c b/checkpoint/ckpt_mem.c
> index b1b50b5..e58b348 100644
> --- a/checkpoint/ckpt_mem.c
> +++ b/checkpoint/ckpt_mem.c
> @@ -741,6 +741,13 @@ int cr_write_mm(struct cr_ctx *ctx, struct task_struct *t)
>
>          hh->map_count = mm->map_count;
>
> +        /* there's got to be a better way to do this */
> +        for (vma = mm->mmap; vma; vma = vma->vm_next) {
> +                const char *name = arch_vma_name(vma);
> +                if (name && strcmp(name, "[vdso]")==0)
> +                        hh->map_count--;
> +        }
> +
>          /* FIX: need also mm->flags */
>
>          ret = cr_write_obj(ctx, &h, hh);
> @@ -750,6 +757,9 @@ int cr_write_mm(struct cr_ctx *ctx, struct task_struct *t)
>
>          /* write the vma's */
>          for (vma = mm->mmap; vma; vma = vma->vm_next) {
> +                const char *name = arch_vma_name(vma);
> +                if (name && strcmp(name, "[vdso]")==0)
> +                        continue;
>                  ret = cr_write_vma(ctx, vma);
>                  if (ret < 0)
>                          goto out;
> diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
> index 33b7235..02168b5 100644
> --- a/fs/binfmt_elf.c
> +++ b/fs/binfmt_elf.c
> @@ -961,7 +961,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
>          set_binfmt(&elf_format);
>
>  #ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES
> -        retval = arch_setup_additional_pages(bprm, !!elf_interpreter);
> +        retval = arch_setup_additional_pages(!!elf_interpreter, 0);
>          if (retval < 0) {
>                  send_sig(SIGKILL, current, 0);
>                  goto out;
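P.S. For the gettimeofday() testcase you mention, something along these
lines should be enough to exercise the vdso across a checkpoint/restart
cycle (a rough, untested sketch; the checkpoint and later restart would
be driven externally while the loop is running):

/* Print the time once a second. If the vdso is remapped at its
 * original address on restart, the values keep advancing sensibly
 * after the task is restarted; otherwise __vdso_gettimeofday()
 * should misbehave, as described above. */
#include <stdio.h>
#include <unistd.h>
#include <sys/time.h>

int main(void)
{
        struct timeval tv;

        for (;;) {
                if (gettimeofday(&tv, NULL))    /* goes via the vdso when glibc supports it */
                        perror("gettimeofday");
                else
                        printf("%ld.%06ld\n", (long)tv.tv_sec, (long)tv.tv_usec);
                fflush(stdout);
                sleep(1);
        }
        return 0;
}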