From: Tatsiana Brouka <Tatsiana_Brouka@xxxxxxxx> Implement a system call for bulk retrieval of pids in binary form. Using /proc is slower than necessary: 3 syscalls + another 3 for each thread + converting with atoi(). /proc may not be mounted, especially in containers. A natural extension of the hidepid=2 efforts is to not mount /proc at all. It could be used by programs like ps, top or CRIU. The speed increase will become more dramatic once combined with bulk retrieval of process statistics. Sample program: #include <stdio.h> static inline long sys_pidmap(int *pid, unsigned int n, int start) { register long r10 asm ("r10") = 0; long rv; asm volatile ( "syscall" : "=a" (rv) : "0" (333), "D" (pid), "S" (n), "d" (start), "r" (r10) : "rcx", "r11", "cc", "memory" ); return rv; } int main(void) { int pid[5]; unsigned int start; int n; start = 0; while ((n = sys_pidmap(pid, sizeof(pid)/sizeof(pid[0]), start)) > 0) { int i; for (i = 0; i < n; i++) { printf(" %u", pid[i]); } printf("\n"); start = pid[n - 1] + 1; } return 0; } Signed-off-by: Tatsiana Brouka <Tatsiana_Brouka@xxxxxxxx> Signed-off-by: Alexey Dobriyan <adobriyan@xxxxxxxxx> --- arch/x86/entry/syscalls/syscall_64.tbl | 1 include/linux/syscalls.h | 4 kernel/Makefile | 2 kernel/pidmap.c | 116 ++++++++++++++ tools/testing/selftests/Makefile | 1 tools/testing/selftests/pidmap/Makefile | 5 tools/testing/selftests/pidmap/pidmap.c | 263 ++++++++++++++++++++++++++++++++ 7 files changed, 392 insertions(+) --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -339,6 +339,7 @@ 330 common pkey_alloc sys_pkey_alloc 331 common pkey_free sys_pkey_free 332 common statx sys_statx +333 common pidmap sys_pidmap # # x32-specific system call numbers start at 512 to avoid cache impact --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -923,4 +923,8 @@ asmlinkage long sys_pkey_free(int pkey); asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags, unsigned mask, struct statx 
__user *buffer); +asmlinkage long sys_pidmap(int __user *pids, + unsigned int pids_count, + unsigned int start_pid, + int flags); #endif --- a/kernel/Makefile +++ b/kernel/Makefile @@ -11,6 +11,8 @@ obj-y = fork.o exec_domain.o panic.o \ notifier.o ksysfs.o cred.o reboot.o \ async.o range.o smpboot.o ucount.o +obj-y += pidmap.o + obj-$(CONFIG_MULTIUSER) += groups.o ifdef CONFIG_FUNCTION_TRACER --- /dev/null +++ b/kernel/pidmap.c @@ -0,0 +1,116 @@ +#include <linux/bitops.h> +#include <linux/cred.h> +#include <linux/kernel.h> +#include <linux/pid.h> +#include <linux/ptrace.h> +#include <linux/rcupdate.h> +#include <linux/syscalls.h> +#include <linux/sched.h> +#include <linux/uaccess.h> + +/** + * pidmap - get allocated PIDs + * @pids: destination buffer in user memory. + * @pids_count: number of elements in the buffer. + * @start_pid: first PID to report (inclusive); 0 is treated as 1. + * @flags: reserved, must be 0. + * + * Write allocated PIDs, in ascending order, to @pids starting at @start_pid. + * PIDs are reported as seen from the caller's active pid namespace: + * unshare(CLONE_NEWPID)+fork+pidmap in child will always return 1/1. + * + * If hide_pid >= HIDEPID_INVISIBLE and the caller is not in ns->pid_gid, only + * PIDs passing ptrace_may_access(PTRACE_MODE_READ_FSCREDS) are reported. + * Note: a returned PID may no longer exist by the time the system call exits. + * NOTE(review): the scan ends at ns->last_pid; confirm after PID wraparound. + * + * Return: number of PIDs written (may be 0), -EINVAL or -EFAULT on error. + */ +SYSCALL_DEFINE4(pidmap, int __user *, pids, unsigned int, pids_count, + unsigned int, start_pid, int, flags) +{ + struct pid_namespace *ns = task_active_pid_ns(current); + unsigned int start_page, start_elem; + unsigned int last_pos = 0; + unsigned int last_set_pid = 0; + unsigned long mask; + bool has_perms = false; + unsigned int i; + + if (flags) + return -EINVAL; + + /* + * PID 0 does not exist, but its bit is always set in + * ->pidmap[0].page, so skip it. 
+ */ + if (start_pid == 0) + start_pid = 1; + + if (start_pid > ns->last_pid) + return 0; + + if (ns->hide_pid < HIDEPID_INVISIBLE || in_group_p(ns->pid_gid)) + has_perms = true; + + start_page = start_pid / BITS_PER_PAGE; + start_elem = (start_pid % BITS_PER_PAGE) / BITS_PER_LONG; + mask = ~0UL << (start_pid % BITS_PER_LONG); + + for (i = start_page; i < PIDMAP_ENTRIES; i++) { + unsigned int j; + + /* + * ->pidmap[].page is set once to a valid pointer and never changes + * afterwards, so it can be tested without taking any locks. + */ + if (ns->pidmap[i].page == NULL) + continue; + + for (j = start_elem; j < PAGE_SIZE/sizeof(unsigned long); j++) { + unsigned long val; + + val = *((unsigned long *)ns->pidmap[i].page + j); + val &= mask; + mask = ~0UL; + while (val != 0) { + struct task_struct *task; + + if (last_pos == pids_count) + return last_pos; + + last_set_pid = i * BITS_PER_PAGE + + j * BITS_PER_LONG + __ffs(val); + + if (has_perms) + goto write; + + rcu_read_lock(); + task = find_task_by_pid_ns(last_set_pid, ns); + if (!task) { + rcu_read_unlock(); + goto next; + } + if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) { + rcu_read_unlock(); + goto next; + } + rcu_read_unlock(); +write: + if (put_user(last_set_pid, pids + last_pos)) + return -EFAULT; + last_pos++; + if (last_set_pid == ns->last_pid) + return last_pos; +next: + val &= (val - 1); + } + + } + start_elem = 0; + } + if (last_set_pid == 0) + return 0; + else + return last_pos; +} --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -20,6 +20,7 @@ TARGETS += mount TARGETS += mqueue TARGETS += net TARGETS += nsfs +TARGETS += pidmap TARGETS += powerpc TARGETS += pstore TARGETS += ptrace --- /dev/null +++ b/tools/testing/selftests/pidmap/Makefile @@ -0,0 +1,5 @@ +CFLAGS = -Wall + +TEST_GEN_PROGS := pidmap + +include ../lib.mk --- /dev/null +++ b/tools/testing/selftests/pidmap/pidmap.c @@ -0,0 +1,263 @@ +#define _GNU_SOURCE +#include <stdio.h> +#include <unistd.h> +#include <fcntl.h> +#include <sys/wait.h> 
+#include <stdlib.h> +#include <sched.h> +#include <dirent.h> +#include <string.h> +#include <sys/mount.h> +#include <signal.h> +#include "../kselftest_harness.h" + +#define SIZE 512 + +static inline long pidmap(int *pid, unsigned int count, unsigned int start_pid) +{ + long ret; + + register long r10 asm("r10") = 0; + + asm volatile ("syscall" : "=a"(ret) : + "0"(333), "D"(pid), "S"(count), "d"(start_pid), "r"(r10) : + "rcx", "r11", "cc", "memory"); + return ret; +} + +static int compare(const void *a, const void *b) +{ + return *((int *)a) > *((int *)b); +} + +int pidmap_full(int **pid, unsigned int *res_count) +{ + int n; + int start_pid = 1; + *pid = (int *)malloc(SIZE * sizeof(int)); + *res_count = 0; + + while ((n = pidmap(*pid + *res_count, SIZE, start_pid)) > 0) { + *res_count += n; + *pid = (int *)realloc(*pid, (*res_count + SIZE) * sizeof(int)); + start_pid = (*pid)[*res_count - 1] + 1; + } + return n; +} + +int pidmap_proc(int **pid, unsigned int *n) +{ + DIR *dir = opendir("/proc"); + struct dirent *dirs; + + *n = 0; + *pid = NULL; + + while ((dirs = readdir(dir))) { + char dname[32] = ""; + DIR *task_dir; + + if (dirs->d_name[0] < '0' || dirs->d_name[0] > '9') + continue; + + strcpy(dname, "/proc/"); + strcat(dname, dirs->d_name); + strcat(dname, "/task"); + task_dir = opendir(dname); + + if (task_dir) { + struct dirent *task_dirs; + + while ((task_dirs = readdir(task_dir))) { + if (task_dirs->d_name[0] < '0' || + task_dirs->d_name[0] > '9') + continue; + + *pid = (int *)realloc(*pid, (*n + 1) * + sizeof(int)); + if (*pid == NULL) + return -1; + *(*pid + *n) = atoi(task_dirs->d_name); + *n += 1; + } + } else { + *pid = (int *)realloc(*pid, (*n + 1) * sizeof(int)); + if (*pid == NULL) + return -1; + *(*pid + *n) = atoi(dirs->d_name); + *n += 1; + } + closedir(task_dir); + } + closedir(dir); + return 0; +} + +TEST(bufsize) +{ + int pid[SIZE]; + + EXPECT_EQ(0, pidmap(pid, 0, 1)); +} + +TEST(get_pid) +{ + int pid; + int ret; + + ret = pidmap(&pid, 1, 
getpid()); + ASSERT_LE(0, ret); + EXPECT_EQ(getpid(), pid); +} + +TEST(bad_start) +{ + int pid[SIZE]; + + ASSERT_LE(0, pidmap(pid, SIZE, -1)); + ASSERT_LE(0, pidmap(pid, SIZE, ~0U)); + ASSERT_LE(0, pidmap(pid, SIZE, 0)); + EXPECT_EQ(1, pid[0]); +} + +TEST(child_pid) +{ + pid_t pid = fork(); + + if (pid == 0) + pause(); + else { + int ret; + int result = 0; + + ret = pidmap(&result, 1, pid); + EXPECT_LE(0, ret); + EXPECT_EQ(pid, result); + kill(pid, SIGTERM); + } +} + +int write_pidmax(int new_pidmax) +{ + char old_pidmax[32]; + char new[32]; + int fd = open("/proc/sys/kernel/pid_max", O_RDWR); + + if (read(fd, old_pidmax, 32) <= 0) + printf("Read failed\n"); + lseek(fd, 0, 0); + snprintf(new, sizeof(new), "%d", new_pidmax); + if (write(fd, new, strlen(new)) <= 0) + printf("Write failed\n"); + close(fd); + return atoi(old_pidmax); +} + +void do_forks(unsigned int n) +{ + while (n--) { + pid_t pid = fork(); + + if (pid == 0) + exit(0); + waitpid(pid, NULL, 0); + } +} + +TEST(pid_max) +{ + int *pid; + unsigned int n; + int ret, p; + int old_pidmax; + + old_pidmax = write_pidmax(50000); + + do_forks(40000); + + p = fork(); + + if (p == 0) + pause(); + + ret = pidmap_full(&pid, &n); + + EXPECT_LE(0, ret); + EXPECT_EQ(p, pid[n - 1]); + + kill(p, SIGKILL); + write_pidmax(old_pidmax); +} + +TEST(compare_proc) +{ + pid_t pid; + + if (unshare(CLONE_NEWNS | CLONE_NEWPID) == -1) + return; + + pid = fork(); + + if (pid == 0) { + pid_t pid; + int i = 0; + + mount("none", "/", NULL, MS_REC | MS_PRIVATE, NULL); + mount("none", "/proc", NULL, MS_REC | MS_PRIVATE, NULL); + mount("proc", "/proc", "proc", + MS_NOSUID | MS_NODEV | MS_NOEXEC, NULL); + + while (i < 150) { + i++; + + pid = fork(); + + if (pid == -1) { + wait(NULL); + umount("/proc"); + return; + } + if (pid == 0) { + pause(); + return; + } + } + + int *pids, *pids_proc; + unsigned int n = 0; + unsigned int n_proc = 0; + int ret, ret_proc; + + ret = pidmap_full(&pids, &n); + + ret_proc = pidmap_proc(&pids_proc, &n_proc); + 
qsort(pids_proc, n_proc, sizeof(int), compare); + + EXPECT_LE(0, ret); + EXPECT_EQ(n_proc, n); + + if (ret <= 0 || ret_proc <= 0 || n != n_proc) { + killpg(0, SIGTERM); + wait(NULL); + umount("/proc"); + free(pids); + free(pids_proc); + return; + } + + for (int i = 0; i < n; i++) { + EXPECT_EQ(pids_proc[i], pids[i]); + if (pids_proc[i] != pids[i]) + break; + } + EXPECT_EQ(1, pids[0]); + + free(pids_proc); + free(pids); + killpg(0, SIGTERM); + wait(NULL); + umount("/proc"); + } +} + +TEST_HARNESS_MAIN -- To unsubscribe from this list: send the line "unsubscribe linux-api" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html