As was discussed on the timens RFC, adding a new conditional branch `if (inside_time_ns)` to the VDSO for all processes is undesirable. It would add a penalty for everybody, as the branch predictor may mispredict the jump. Also, instruction cache lines are wasted on cmp/jmp. Those effects of introducing the time namespace are very much unwanted, bearing in mind how much work has been spent on micro-optimising the vdso code. To address those problems, there are two versions of the VDSO's .so: one for host tasks (without any penalty) and one for processes inside a time namespace, with clk_to_ns() that subtracts offsets from the host's time. Whenever a user does setns() or unshare(CLONE_TIMENS) followed by clone(), change the VDSO image in mm and zap the VVAR/VDSO page tables. They will be re-faulted with the corresponding image and VVAR offsets. Co-developed-by: Andrei Vagin <avagin@xxxxxxxxx> Signed-off-by: Andrei Vagin <avagin@xxxxxxxxx> Signed-off-by: Dmitry Safonov <dima@xxxxxxxxxx> --- arch/x86/entry/vdso/vma.c | 23 +++++++++++++++++++++++ arch/x86/include/asm/vdso.h | 1 + kernel/time_namespace.c | 11 +++++++++++ 3 files changed, 35 insertions(+) diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 8a8211fd4cfc..91cf5a5c8c9e 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -25,6 +25,7 @@ #include <asm/cpufeature.h> #include <clocksource/hyperv_timer.h> #include <asm/page.h> +#include <asm/tlb.h> #if defined(CONFIG_X86_64) unsigned int __read_mostly vdso64_enabled = 1; @@ -266,6 +267,28 @@ static const struct vm_special_mapping vvar_mapping = { .mremap = vvar_mremap, }; +#ifdef CONFIG_TIME_NS +int vdso_join_timens(struct task_struct *task) +{ + struct mm_struct *mm = task->mm; + struct vm_area_struct *vma; + + if (down_write_killable(&mm->mmap_sem)) + return -EINTR; + + for (vma = mm->mmap; vma; vma = vma->vm_next) { + unsigned long size = vma->vm_end - vma->vm_start; + + if (vma_is_special_mapping(vma, &vvar_mapping) || + vma_is_special_mapping(vma, 
&vdso_mapping)) + zap_page_range(vma, vma->vm_start, size); + } + + up_write(&mm->mmap_sem); + return 0; +} +#endif + /* * Add vdso and vvar mappings to current process. * @image - blob to map diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h index 03f468c63a24..ccf89dedd04f 100644 --- a/arch/x86/include/asm/vdso.h +++ b/arch/x86/include/asm/vdso.h @@ -45,6 +45,7 @@ extern struct vdso_image vdso_image_32; extern void __init init_vdso_image(struct vdso_image *image); extern int map_vdso_once(const struct vdso_image *image, unsigned long addr); +extern int vdso_join_timens(struct task_struct *task); #endif /* __ASSEMBLER__ */ diff --git a/kernel/time_namespace.c b/kernel/time_namespace.c index 9807c5c90cb2..4b2eb92ad595 100644 --- a/kernel/time_namespace.c +++ b/kernel/time_namespace.c @@ -15,6 +15,7 @@ #include <linux/cred.h> #include <linux/err.h> #include <linux/mm.h> +#include <asm/vdso.h> ktime_t do_timens_ktime_to_host(clockid_t clockid, ktime_t tim, struct timens_offsets *ns_offsets) @@ -199,6 +200,7 @@ static void timens_put(struct ns_common *ns) static int timens_install(struct nsproxy *nsproxy, struct ns_common *new) { struct time_namespace *ns = to_time_ns(new); + int ret; if (!thread_group_empty(current)) return -EINVAL; @@ -207,6 +209,10 @@ static int timens_install(struct nsproxy *nsproxy, struct ns_common *new) !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) return -EPERM; + ret = vdso_join_timens(current); + if (ret) + return ret; + get_time_ns(ns); get_time_ns(ns); put_time_ns(nsproxy->time_ns); @@ -221,10 +227,15 @@ int timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk) { struct ns_common *nsc = &nsproxy->time_ns_for_children->ns; struct time_namespace *ns = to_time_ns(nsc); + int ret; if (nsproxy->time_ns == nsproxy->time_ns_for_children) return 0; + ret = vdso_join_timens(tsk); + if (ret) + return ret; + get_time_ns(ns); put_time_ns(nsproxy->time_ns); nsproxy->time_ns = ns; -- 2.22.0