Define a sysctl 'ckpt_unpriv_allowed' which determines whether all checkpoints and restarts require CAP_SYS_ADMIN. If it is 1, then regular permission checks are intended to prevent privilege escalation, but setting it to 0 prevents unprivileged users from exploiting any privilege escalation bugs. Define a CHECKPOINT_SUBTREE flag for sys_checkpoint() which allows checkpointing a subtree of processes. Without this flag, the syscall expects to checkpoint an entire container (in the sense of a pid namespace), starting with the container init task. Signed-off-by: Oren Laadan <orenl@xxxxxxxxxxxxxxx> --- checkpoint/checkpoint.c | 4 ++++ checkpoint/restart.c | 2 +- checkpoint/sys.c | 17 +++++++++++++++-- include/linux/checkpoint_types.h | 12 +++++++++++- kernel/sysctl.c | 19 +++++++++++++++++++ 5 files changed, 50 insertions(+), 4 deletions(-) diff --git a/checkpoint/checkpoint.c b/checkpoint/checkpoint.c index 0299046..6305e5d 100644 --- a/checkpoint/checkpoint.c +++ b/checkpoint/checkpoint.c @@ -423,6 +423,10 @@ static int get_container(struct ckpt_ctx *ctx, pid_t pid) ctx->root_nsproxy = nsproxy; ctx->root_init = is_container_init(task); + /* FIX: does this error code make sense here ? 
*/ + if (!(ctx->flags & CHECKPOINT_SUBTREE) && !ctx->root_init) + return -EBUSY; + return 0; out: diff --git a/checkpoint/restart.c b/checkpoint/restart.c index edc89ba..e5a29fb 100644 --- a/checkpoint/restart.c +++ b/checkpoint/restart.c @@ -287,7 +287,7 @@ static int restore_read_header(struct ckpt_ctx *ctx) h->minor != ((LINUX_VERSION_CODE >> 8) & 0xff) || h->patch != ((LINUX_VERSION_CODE) & 0xff)) goto out; - if (h->flags & ~CKPT_CTX_CHECKPOINT) + if (h->flags & ~(CKPT_CTX_CHECKPOINT | CKPT_USER_FLAGS)) goto out; if (h->uts_release_len != sizeof(uts->release) || h->uts_version_len != sizeof(uts->version) || diff --git a/checkpoint/sys.c b/checkpoint/sys.c index a613748..e3f7012 100644 --- a/checkpoint/sys.c +++ b/checkpoint/sys.c @@ -21,6 +21,13 @@ #include <linux/checkpoint.h> /* + * ckpt_unpriv_allowed - sysctl-controlled; if 0, sys_checkpoint() and + * sys_restart() fail with -EPERM unless the caller has CAP_SYS_ADMIN. + */ +int ckpt_unpriv_allowed = 1; /* default: yes */ + + +/* * Helpers to write(read) from(to) kernel space to(from) the checkpoint * image file descriptor (similar to how a core-dump is performed). 
* @@ -296,10 +303,13 @@ asmlinkage long sys_checkpoint(pid_t pid, int fd, unsigned long flags) struct ckpt_ctx *ctx; int ret; - /* no flags for now */ - if (flags) + /* check user flags */ + if (flags & ~CKPT_USER_FLAGS) return -EINVAL; + if (!ckpt_unpriv_allowed && !capable(CAP_SYS_ADMIN)) + return -EPERM; + if (pid == 0) pid = current->pid; ctx = ckpt_ctx_alloc(fd, flags | CKPT_CTX_CHECKPOINT); @@ -334,6 +344,9 @@ asmlinkage long sys_restart(int crid, int fd, unsigned long flags) if (flags) return -EINVAL; + if (!ckpt_unpriv_allowed && !capable(CAP_SYS_ADMIN)) + return -EPERM; + /* FIXME: for now, we use 'crid' as a pid */ pid = (pid_t) crid; diff --git a/include/linux/checkpoint_types.h b/include/linux/checkpoint_types.h index 85eb184..09d3238 100644 --- a/include/linux/checkpoint_types.h +++ b/include/linux/checkpoint_types.h @@ -10,6 +10,13 @@ * distribution for more details. */ +#define CKPT_VERSION 1 + +#define CHECKPOINT_SUBTREE 0x4 + + +#ifdef __KERNEL__ + struct ckpt_ctx; #include <linux/list.h> @@ -19,7 +26,6 @@ struct ckpt_ctx; #include <linux/sched.h> #include <asm/atomic.h> -#define CKPT_VERSION 1 struct ckpt_ctx { int crid; /* unique checkpoint id */ @@ -67,5 +73,9 @@ struct ckpt_ctx { #define CKPT_CTX_CHECKPOINT 0x1 #define CKPT_CTX_RESTART 0x2 +#define CKPT_USER_FLAGS (CHECKPOINT_SUBTREE) + + +#endif /* __KERNEL__ */ #endif /* _LINUX_CHECKPOINT_TYPES_H_ */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index e3d2c7d..21f9c48 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -192,6 +192,10 @@ int sysctl_legacy_va_layout; extern int prove_locking; extern int lock_stat; +#ifdef CONFIG_CHECKPOINT +extern int ckpt_unpriv_allowed; +#endif + /* The default sysctl tables: */ static struct ctl_table root_table[] = { @@ -910,6 +914,20 @@ static struct ctl_table kern_table[] = { .child = slow_work_sysctls, }, #endif +#ifdef CONFIG_CHECKPOINT + { + .ctl_name = CTL_UNNUMBERED, + .procname = "ckpt_unpriv_allowed", + .data = &ckpt_unpriv_allowed, + .maxlen = 
sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &zero, + .extra2 = &one, + }, +#endif + /* * NOTE: do not add new entries to this table unless you have read * Documentation/sysctl/ctl_unnumbered.txt @@ -1302,6 +1320,7 @@ static struct ctl_table vm_table[] = { .proc_handler = &scan_unevictable_handler, }, #endif + /* * NOTE: do not add new entries to this table unless you have read * Documentation/sysctl/ctl_unnumbered.txt -- 1.5.4.3 _______________________________________________ Containers mailing list Containers@xxxxxxxxxxxxxxxxxxxxxxxxxx https://lists.linux-foundation.org/mailman/listinfo/containers