Couple of nits: Oren Laadan [orenl@xxxxxxxxxxxxxxx] wrote: | From f5dec38baa6a2cc2a88783db3a9afd676821d293 Mon Sep 17 00:00:00 2001 | From: Oren Laadan <orenl@xxxxxxxxxxxxxxx> | Date: Mon, 30 Mar 2009 11:14:32 -0400 | Subject: [PATCH 05/29] x86 support for checkpoint/restart | | Add logic to save and restore architecture specific state, including | thread-specific state, CPU registers and FPU state. | | In addition, architecture capabilities are saved in an architecure | specific extension of the header (cr_hdr_head_arch); Currently this | includes only FPU capabilities. | | Currently only x86-32 is supported. Compiling on x86-64 will trigger | an explicit error. | | Changelog[v14]: | - Remove preempt_disable/enable() around init_fpu() and fix leak | - Revert change to pr_debug(), back to cr_debug() | - Use only unsigned fields in checkpoint headers | - Discard field 'h->parent' | - Check whether calls to cr_hbuf_get() fail | | Changelog[v12]: | - A couple of missed calls to cr_hbuf_put() | - Replace obsolete cr_debug() with pr_debug() | | Changelog[v9]: | - Add arch-specific header that details architecture capabilities; | split FPU restore to send capabilities only once. | - Test for zero TLS entries in cr_write_thread() | - Fix asm/checkpoint_hdr.h so it can be included from user-space | | Changelog[v7]: | - Fix save/restore state of FPU | | Changelog[v5]: | - Remove preempt_disable() when restoring debug registers | | Changelog[v4]: | - Fix header structure alignment | | Changelog[v2]: | - Pad header structures to 64 bits to ensure compatibility | - Follow Dave Hansen's refactoring of the original post | | Signed-off-by: Oren Laadan <orenl@xxxxxxxxxxxxxxx> | Acked-by: Serge Hallyn <serue@xxxxxxxxxx> | Signed-off-by: Dave Hansen <dave@xxxxxxxxxxxxxxxxxx> | --- | arch/x86/include/asm/checkpoint_hdr.h | 98 +++++++++++++ | arch/x86/mm/Makefile | 2 + | arch/x86/mm/checkpoint.c | 242 +++++++++++++++++++++++++++++++++ | arch/x86/mm/restart.c | 223 ++++++++++++++++++++++++++++++ | checkpoint/checkpoint.c | 19 ++- | checkpoint/checkpoint_arch.h | 9 ++ | checkpoint/restart.c | 17 ++- | include/linux/checkpoint_hdr.h | 2 + | 8 files changed, 608 insertions(+), 4 deletions(-) | create mode 100644 arch/x86/include/asm/checkpoint_hdr.h | create mode 100644 arch/x86/mm/checkpoint.c | create mode 100644 arch/x86/mm/restart.c | create mode 100644 checkpoint/checkpoint_arch.h | | diff --git a/arch/x86/include/asm/checkpoint_hdr.h b/arch/x86/include/asm/checkpoint_hdr.h | new file mode 100644 | index 0000000..ffdb5f5 | --- /dev/null | +++ b/arch/x86/include/asm/checkpoint_hdr.h | @@ -0,0 +1,98 @@ | +#ifndef __ASM_X86_CKPT_HDR_H | +#define __ASM_X86_CKPT_HDR_H | +/* | + * Checkpoint/restart - architecture specific headers x86 | + * | + * Copyright (C) 2008-2009 Oren Laadan | + * | + * This file is subject to the terms and conditions of the GNU General Public | + * License. See the file COPYING in the main directory of the Linux | + * distribution for more details. | + */ | + | +#include <linux/types.h> | + | +/* | + * To maintain compatibility between 32-bit and 64-bit architecture flavors, | + * keep data 64-bit aligned: use padding for structure members, and use | + * __attribute__ ((aligned (8))) for the entire structure. | + * | + * Quoting Arnd Bergmann: | + * "This structure has an odd multiple of 32-bit members, which means | + * that if you put it into a larger structure that also contains 64-bit | + * members, the larger structure may get different alignment on x86-32 | + * and x86-64, which you might want to avoid. I can't tell if this is | + * an actual problem here. ... In this case, I'm pretty sure that | + * sizeof(cr_hdr_task) on x86-32 is different from x86-64, since it | + * will be 32-bit aligned on x86-32." | + */ | + | +/* i387 structure seen from kernel/userspace */ | +#ifdef __KERNEL__ | +#include <asm/processor.h> | +#else | +#include <sys/user.h> | +#endif | + | +struct cr_hdr_head_arch { | + /* FIXME: add HAVE_HWFP */ | + | + __u16 has_fxsr; | + __u16 has_xsave; | + __u16 xstate_size; | + __u16 _pading; | +} __attribute__((aligned(8))); | + | +struct cr_hdr_thread { | + /* FIXME: restart blocks */ | + | + __u16 gdt_entry_tls_entries; | + __u16 sizeof_tls_array; | + __u16 ntls; /* number of TLS entries to follow */ | +} __attribute__((aligned(8))); | + | +struct cr_hdr_cpu { | + /* see struct pt_regs (x86-64) */ | + __u64 r15; | + __u64 r14; | + __u64 r13; | + __u64 r12; | + __u64 bp; | + __u64 bx; | + __u64 r11; | + __u64 r10; | + __u64 r9; | + __u64 r8; | + __u64 ax; | + __u64 cx; | + __u64 dx; | + __u64 si; | + __u64 di; | + __u64 orig_ax; | + __u64 ip; | + __u64 cs; | + __u64 flags; | + __u64 sp; | + __u64 ss; | + | + /* segment registers */ | + __u64 ds; | + __u64 es; | + __u64 fs; | + __u64 gs; | + | + /* debug registers */ | + __u64 debugreg0; | + __u64 debugreg1; | + __u64 debugreg2; | + __u64 debugreg3; | + __u64 debugreg6; | + __u64 debugreg7; | + | + __u32 uses_debug; | + __u32 used_math; | + | + /* thread_xstate contents follow (if used_math) */ | +} __attribute__((aligned(8))); | + | +#endif /* __ASM_X86_CKPT_HDR__H */ | diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile | index d8cc96a..e1cb5f8 100644 | --- a/arch/x86/mm/Makefile | +++ b/arch/x86/mm/Makefile | @@ -17,3 +17,5 @@ obj-$(CONFIG_K8_NUMA) += k8topology_64.o | obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o | | obj-$(CONFIG_MEMTEST) += memtest.o | + | +obj-$(CONFIG_CHECKPOINT) += checkpoint.o restart.o | diff --git a/arch/x86/mm/checkpoint.c b/arch/x86/mm/checkpoint.c | new file mode 100644 | index 0000000..946fac1 | --- /dev/null | +++ b/arch/x86/mm/checkpoint.c | @@ -0,0 +1,242 @@ | +/* | + * Checkpoint/restart - architecture specific support for x86 | + * | + * Copyright (C) 2008-2009 Oren Laadan | + * | + * This file is subject to the terms and conditions of the GNU General Public | + * License. See the file COPYING in the main directory of the Linux | + * distribution for more details. | + */ | + | +#include <asm/desc.h> | +#include <asm/i387.h> | + | +#include <linux/checkpoint.h> | +#include <linux/checkpoint_hdr.h> | + | +/* dump the thread_struct of a given task */ | +int cr_write_thread(struct cr_ctx *ctx, struct task_struct *t) | +{ | + struct cr_hdr h; | + struct cr_hdr_thread *hh; | + struct thread_struct *thread; | + struct desc_struct *desc; | + int ntls = 0; | + int n, ret; | + | + h.type = CR_HDR_THREAD; | + h.len = sizeof(*hh); | + | + hh = cr_hbuf_get(ctx, sizeof(*hh)); | + if (!hh) | + return -ENOMEM; | + | + thread = &t->thread; | + | + /* calculate no. of TLS entries that follow */ | + desc = thread->tls_array; | + for (n = GDT_ENTRY_TLS_ENTRIES; n > 0; n--, desc++) { | + if (desc->a || desc->b) | + ntls++; | + } | + | + hh->gdt_entry_tls_entries = GDT_ENTRY_TLS_ENTRIES; | + hh->sizeof_tls_array = sizeof(thread->tls_array); | + hh->ntls = ntls; | + | + ret = cr_write_obj(ctx, &h, hh); | + cr_hbuf_put(ctx, sizeof(*hh)); | + if (ret < 0) | + return ret; | + | + cr_debug("ntls %d\n", ntls); | + if (ntls == 0) | + return 0; | + | + /* | + * For simplicity dump the entire array, cherry-pick upon restart | + * FIXME: the TLS descriptors in the GDT should be called out and | + * not tied to the in-kernel representation. | + */ | + ret = cr_kwrite(ctx, thread->tls_array, sizeof(thread->tls_array)); | + | + /* IGNORE RESTART BLOCKS FOR NOW ... */ | + | + return ret; | +} | + | +#ifdef CONFIG_X86_64 | + | +#error "CONFIG_X86_64 unsupported yet." | + | +#else /* !CONFIG_X86_64 */ | + | +static void cr_save_cpu_regs(struct cr_hdr_cpu *hh, struct task_struct *t) | +{ | + struct thread_struct *thread = &t->thread; | + struct pt_regs *regs = task_pt_regs(t); | + | + hh->bp = regs->bp; | + hh->bx = regs->bx; | + hh->ax = regs->ax; | + hh->cx = regs->cx; | + hh->dx = regs->dx; | + hh->si = regs->si; | + hh->di = regs->di; | + hh->orig_ax = regs->orig_ax; | + hh->ip = regs->ip; | + hh->cs = regs->cs; | + hh->flags = regs->flags; | + hh->sp = regs->sp; | + hh->ss = regs->ss; | + | + hh->ds = regs->ds; | + hh->es = regs->es; | + | + /* | + * for checkpoint in process context (from within a container) | + * the GS and FS registers should be saved from the hardware; | + * otherwise they are already sabed on the thread structure | + */ | + if (t == current) { | + savesegment(gs, hh->gs); | + savesegment(fs, hh->fs); | + } else { | + hh->gs = thread->gs; | + hh->fs = thread->fs; | + } | + | + /* | + * for checkpoint in process context (from within a container), | + * the actual syscall is taking place at this very moment; so | + * we (optimistically) subtitute the future return value (0) of | + * this syscall into the orig_eax, so that upon restart it will | + * succeed (or it will endlessly retry checkpoint...) | + */ | + if (t == current) { | + BUG_ON(hh->orig_ax < 0); | + hh->ax = 0; | + } | +} | + | +static void cr_save_cpu_debug(struct cr_hdr_cpu *hh, struct task_struct *t) | +{ | + struct thread_struct *thread = &t->thread; | + | + /* debug regs */ | + | + /* | + * for checkpoint in process context (from within a container), | + * get the actual registers; otherwise get the saved values. | + */ | + | + if (t == current) { | + get_debugreg(hh->debugreg0, 0); | + get_debugreg(hh->debugreg1, 1); | + get_debugreg(hh->debugreg2, 2); | + get_debugreg(hh->debugreg3, 3); | + get_debugreg(hh->debugreg6, 6); | + get_debugreg(hh->debugreg7, 7); | + } else { | + hh->debugreg0 = thread->debugreg0; | + hh->debugreg1 = thread->debugreg1; | + hh->debugreg2 = thread->debugreg2; | + hh->debugreg3 = thread->debugreg3; | + hh->debugreg6 = thread->debugreg6; | + hh->debugreg7 = thread->debugreg7; | + } | + | + hh->uses_debug = !!(task_thread_info(t)->flags & TIF_DEBUG); | +} | + | +static void cr_save_cpu_fpu(struct cr_hdr_cpu *hh, struct task_struct *t) | +{ | + hh->used_math = tsk_used_math(t) ? 1 : 0; | +} | + | +static int cr_write_cpu_fpu(struct cr_ctx *ctx, struct task_struct *t) | +{ | + void *xstate_buf = cr_hbuf_get(ctx, xstate_size); | + int ret; | + | + /* i387 + MMU + SSE logic */ | + preempt_disable(); /* needed it (t == current) */ | + | + /* | + * normally, no need to unlazy_fpu(), since TS_USEDFPU flag | + * have been cleared when task was context-switched out... Nit: s/have been/was/ | + * except if we are in process context, in which case we do | + */ | + if (t == current && (task_thread_info(t)->status & TS_USEDFPU)) | + unlazy_fpu(current); | + | + /* | + * For simplicity dump the entire structure. | + * FIXME: need to be deliberate about what registers we are | + * dumping for traceability and compatibility. | + */ | + memcpy(xstate_buf, t->thread.xstate, xstate_size); | + preempt_enable(); /* needed it (t == current) */ Nit: s/it/if/ _______________________________________________ Containers mailing list Containers@xxxxxxxxxxxxxxxxxxxxxxxxxx https://lists.linux-foundation.org/mailman/listinfo/containers