Each process has a different pid in each pid namespace it belongs to.
When interaction happens within a single pid namespace, no translation is
required. More complicated scenarios need special handling.

For example:
- reading pid-files or logs written inside a container with a pid namespace
- attaching with ptrace to tasks from a different pid namespace
- passing pids across pid namespaces in any kind of API

Currently there are several interfaces that could be used here:

Pid namespaces are identified by the inode number of /proc/[pid]/ns/pid.
Pids for nested pid namespaces are shown in the file /proc/[pid]/status.
In some cases the pid -> vpid conversion can easily be done using this
information, but the backward translation requires scanning all tasks.

A unix socket automatically translates the pid attached to SCM_CREDENTIALS.
This requires CAP_SYS_ADMIN for sending arbitrary pids and for entering the
pid namespace; this exposes the process and could be insecure.

This patch adds a new syscall for converting pids between pid namespaces:

pid_t translate_pid(pid_t pid, int source_type, int source,
                    int target_type, int target);

@source_type and @target_type define the type of the argument that follows
each of them:

TRANSLATE_PID_CURRENT_PIDNS - current pid namespace, argument is unused
TRANSLATE_PID_TASK_PIDNS    - task pid-ns, argument is task pid
TRANSLATE_PID_FD_PIDNS      - pidns fd, argument is file descriptor

The syscall returns the pid in the target pid-ns, or zero if the task has no
pid there.

Error codes:
-EINVAL - @source or @target couldn't be resolved into a pid namespace
-ESRCH  - task with @pid is not found in the @source pid namespace

Other pid namespaces are referenced either by the pid of any process that
lives inside them or by a file descriptor pointing to /proc/[pid]/ns/pid.

The latter method provides better protection against races, but in some
cases requires CAP_SYS_PTRACE.

translate_pid() could breach pid isolation and return pids from outer pid
namespaces if and only if the process already has a file descriptor pointing
to them.
Examples:

- get pid in current pid namespace
  translate_pid(pid, TRANSLATE_PID_FD_PIDNS, ns_fd, TRANSLATE_PID_CURRENT_PIDNS, 0)
  or
  translate_pid(pid, TRANSLATE_PID_TASK_PIDNS, ns_pid, TRANSLATE_PID_CURRENT_PIDNS, 0)

- get pid in other pid namespace
  translate_pid(pid, TRANSLATE_PID_CURRENT_PIDNS, 0, TRANSLATE_PID_FD_PIDNS, ns_fd)
  or
  translate_pid(pid, TRANSLATE_PID_CURRENT_PIDNS, 0, TRANSLATE_PID_TASK_PIDNS, ns_pid)

- get deepest pid
  translate_pid(pid, TRANSLATE_PID_CURRENT_PIDNS, 0, TRANSLATE_PID_TASK_PIDNS, pid)

- get pid of init task for namespace
  translate_pid(1, TRANSLATE_PID_FD_PIDNS, ns_fd, TRANSLATE_PID_CURRENT_PIDNS, 0)

This syscall could also be used for checking the topology of pid namespaces:

- ns1 nests inside ns2
  translate_pid(1, TRANSLATE_PID_FD_PIDNS, ns1_fd, TRANSLATE_PID_FD_PIDNS, ns2_fd) > 1

- task1 lives in same pid-namespace as task2
  translate_pid(1, TRANSLATE_PID_TASK_PIDNS, task1_pid, TRANSLATE_PID_TASK_PIDNS, task2_pid) == 1

- task1 is isolated from task2
  translate_pid(task1_pid, TRANSLATE_PID_CURRENT_PIDNS, 0, TRANSLATE_PID_TASK_PIDNS, task2_pid) == 0

- pid is reachable from ns
  translate_pid(pid, TRANSLATE_PID_CURRENT_PIDNS, 0, TRANSLATE_PID_FD_PIDNS, ns_fd) > 0

Signed-off-by: Konstantin Khlebnikov <khlebnikov@xxxxxxxxxxxxxx>

---

v1: https://lkml.org/lkml/2015/9/15/411

v2: https://lkml.org/lkml/2015/9/24/278
 * use namespace-fd as second/third argument
 * add -pid for getting parent pid
 * move code into kernel/sys.c next to getppid
 * drop ifdef CONFIG_PID_NS
 * add generic syscall

v3: https://lkml.org/lkml/2015/9/28/3
 * use proc_ns_fdget()
 * update description
 * rebase to next-20150925
 * fix conflict with mlock2

v4: https://lkml.org/lkml/2017/10/16/852
 * rename into translate_pid()
 * remove syscall if CONFIG_PID_NS=n
 * drop -pid for parent task
 * drop fget-fdget optimizations
 * add helper get_pid_ns_by_fd()
 * wire only into x86

v5:
 * rewrite commit message
 * resolve pidns by task pid or by pidns fd
 * add arguments source_type and target_type

---
sample tool translate_pid.c --- #define _GNU_SOURCE #include <sys/syscall.h> #include <sys/types.h> #include <sys/stat.h> #include <sched.h> #include <fcntl.h> #include <err.h> #include <unistd.h> #include <stdlib.h> #include <stdio.h> #ifndef SYS_translate_pid #ifdef __x86_64__ #define SYS_translate_pid 333 #endif #endif #ifndef TRANSLATE_PID_CURRENT_PIDNS #define TRANSLATE_PID_CURRENT_PIDNS 0 #define TRANSLATE_PID_TASK_PIDNS 1 #define TRANSLATE_PID_FD_PIDNS 2 #endif pid_t translate_pid(pid_t pid, int source_type, int source, int target_type, int target) { return syscall(SYS_translate_pid, pid, source_type, source, target_type, target); } int main(int argc, char **argv) { int pid, source, target; char buf[64]; if (argc != 4) errx(1, "usage: %s <pid> <source> <traget>", argv[0]); pid = atoi(argv[1]); int source_type, target_type; source = atoi(argv[2]); target = atoi(argv[3]); if (source < 0) { source_type = TRANSLATE_PID_TASK_PIDNS; source = -source; } else if (source > 0) { source_type = TRANSLATE_PID_FD_PIDNS; sprintf(buf, "/proc/%d/ns/pid", source); source = open(buf, O_RDONLY); if (source < 0) err(2, "open source %s", buf); } else { source_type = TRANSLATE_PID_CURRENT_PIDNS; } if (target < 0) { target_type = TRANSLATE_PID_TASK_PIDNS; target = -target; } else if (target > 0) { target_type = TRANSLATE_PID_FD_PIDNS; sprintf(buf, "/proc/%d/ns/pid", target); target = open(buf, O_RDONLY); if (target < 0) err(2, "open target %s", buf); } else { target_type = TRANSLATE_PID_CURRENT_PIDNS; } pid = translate_pid(pid, source_type, source, target_type, target); if (pid < 0) err(2, "translate"); printf("%d\n", pid); return 0; } --- --- arch/x86/entry/syscalls/syscall_32.tbl | 1 + arch/x86/entry/syscalls/syscall_64.tbl | 1 + include/linux/syscalls.h | 4 ++ include/uapi/linux/sched.h | 7 ++++ kernel/pid_namespace.c | 64 ++++++++++++++++++++++++++++++++ kernel/sys_ni.c | 3 ++ 6 files changed, 80 insertions(+) diff --git a/arch/x86/entry/syscalls/syscall_32.tbl 
b/arch/x86/entry/syscalls/syscall_32.tbl index c58f75b088c5..aef52c709845 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -391,3 +391,4 @@ 382 i386 pkey_free sys_pkey_free 383 i386 statx sys_statx 384 i386 arch_prctl sys_arch_prctl compat_sys_arch_prctl +385 i386 translate_pid sys_translate_pid diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 5aef183e2f85..1ebdab83c6f4 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -339,6 +339,7 @@ 330 common pkey_alloc sys_pkey_alloc 331 common pkey_free sys_pkey_free 332 common statx sys_statx +333 common translate_pid sys_translate_pid # # x32-specific system call numbers start at 512 to avoid cache impact diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index b961184f597a..d189a1f61160 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -553,6 +553,10 @@ asmlinkage long sys_clock_nanosleep(clockid_t which_clock, int flags, /* kernel/printk.c */ asmlinkage long sys_syslog(int type, char __user *buf, int len); +/* kernel/pid_namespace.c */ +asmlinkage long sys_translate_pid(pid_t pid, int source_type, int source, + int target_type, int target); + /* kernel/ptrace.c */ asmlinkage long sys_ptrace(long request, long pid, unsigned long addr, unsigned long data); diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h index 22627f80063e..7c45fd8d33d7 100644 --- a/include/uapi/linux/sched.h +++ b/include/uapi/linux/sched.h @@ -55,4 +55,11 @@ SCHED_FLAG_RECLAIM | \ SCHED_FLAG_DL_OVERRUN) +/* + * For translate_pid() + */ +#define TRANSLATE_PID_CURRENT_PIDNS 0 /* Current pid namespace */ +#define TRANSLATE_PID_TASK_PIDNS 1 /* Namespace by task pid */ +#define TRANSLATE_PID_FD_PIDNS 2 /* Namespace by pidns fd */ + #endif /* _UAPI_LINUX_SCHED_H */ diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 2a2ac53d8b8b..84c8b47289d5 
100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -13,6 +13,7 @@ #include <linux/user_namespace.h> #include <linux/syscalls.h> #include <linux/cred.h> +#include <linux/file.h> #include <linux/err.h> #include <linux/acct.h> #include <linux/slab.h> @@ -380,6 +381,69 @@ static void pidns_put(struct ns_common *ns) put_pid_ns(to_pid_ns(ns)); } +/* Under rcu_read_lock(). Returns pointer to pid_namespace or NULL. */ +static struct pid_namespace *resolve_pid_ns(int type, int fd_or_pid) +{ + struct pid_namespace *current_ns = task_active_pid_ns(current); + struct pid_namespace *pidns = NULL; + struct ns_common *ns; + struct file *file; + + switch (type) { + case TRANSLATE_PID_CURRENT_PIDNS: + pidns = current_ns; + break; + case TRANSLATE_PID_TASK_PIDNS: + pidns = ns_of_pid(find_pid_ns(fd_or_pid, current_ns)); + break; + case TRANSLATE_PID_FD_PIDNS: + file = proc_ns_fget(fd_or_pid); + if (!IS_ERR(file)) { + ns = get_proc_ns(file_inode(file)); + if (ns->ops->type == CLONE_NEWPID) + pidns = to_pid_ns(ns); + fput(file); + } + break; + } + + return pidns; +} + +/* + * translate_pid - convert pid in source pid-ns into target pid-ns. + * @pid: pid for translation + * @source_type: one of TRANSLATE_PID_* + * @source: depending on @source_type pid-ns fd, pid, or nothing + * @target_type: one of TRANSLATE_PID_* + * @target: depending on @target_type pid-ns fd, pid, or nothing + * + * Returns pid in @target pid-ns, zero if task have no pid there, + * or -ESRCH if task with @pid does not found in @source pid-ns, + * or -EINVAL if @source or @target couldn't be resolved into pid-ns. 
+ */ +SYSCALL_DEFINE5(translate_pid, pid_t, pid, + int, source_type, int, source, + int, target_type, int, target) +{ + struct pid_namespace *source_ns, *target_ns; + struct pid *struct_pid; + pid_t result = -EINVAL; + + rcu_read_lock(); + source_ns = resolve_pid_ns(source_type, source); + if (!source_ns) + goto out; + target_ns = resolve_pid_ns(target_type, target); + if (!target_ns) + goto out; + struct_pid = find_pid_ns(pid, source_ns); + result = struct_pid ? pid_nr_ns(struct_pid, target_ns) : -ESRCH; +out: + rcu_read_unlock(); + return result; +} + static int pidns_install(struct nsproxy *nsproxy, struct ns_common *ns) { struct pid_namespace *active = task_active_pid_ns(current); diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 6cafc008f6db..777689bce406 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -146,6 +146,9 @@ COND_SYSCALL(delete_module); /* kernel/printk.c */ COND_SYSCALL(syslog); +/* kernel/pid_namespace.c */ +COND_SYSCALL(sys_translate_pid); + /* kernel/ptrace.c */ /* kernel/sched/core.c */ -- To unsubscribe from this list: send the line "unsubscribe linux-api" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html