Hi, I have recently been working on adding cgroup namespace support to Docker. Since `setns()` for a cgroup namespace no longer requires the process to first move under the target cgroupns-root, I accidentally found that if two processes share a cgroup namespace (the first process’s cgroup-root is `/p1` while the second’s is `../p2`) but join different pid namespaces, then the second process’s `/sys/fs/cgroup/<subsystem>/tasks` file is empty. Here is an example: In session 1: # mkdir -p /sys/fs/cgroup/freezer/p1 # echo $$ 110413 # echo 110413 > /sys/fs/cgroup/freezer/p1/tasks # cat /proc/self/cgroup | grep freezer 7:freezer:/p1 Next, we use `unshare` to create a process running a new shell in new cgroup, pid and mount namespaces: # unshare -C -m -p bash # cat /proc/self/cgroup | grep freezer 7:freezer:/ # cat /proc/self/mountinfo | grep freezer 308 301 0:31 /.. /sys/fs/cgroup/freezer rw,relatime - cgroup cgroup rw,freezer Now, we remount the freezer cgroup filesystem inside this cgroup namespace: # mount --make-rslave / # umount /sys/fs/cgroup/freezer # mount -t cgroup -o freezer freezer /sys/fs/cgroup/freezer # cat /proc/self/mountinfo | grep freezer 308 301 0:31 / /sys/fs/cgroup/freezer rw,relatime - cgroup freezer rw,freezer # cat /sys/fs/cgroup/freezer/tasks 1 371 In session 2: # mkdir -p /sys/fs/cgroup/freezer/p2 # echo $$ 110613 # echo 110613 > /sys/fs/cgroup/freezer/p2/tasks # cat /proc/self/cgroup | grep freezer 7:freezer:/p2 Next, we run the program shown below to execute a shell in new pid and mount namespaces, but sharing the cgroup namespace with the new bash process above: # ./test # cat /proc/self/cgroup | grep freezer 7:freezer:/../p2 # cat /proc/self/mountinfo | grep freezer 360 353 0:31 /.. 
/sys/fs/cgroup/freezer rw,relatime - cgroup cgroup rw,freezer Also, we remount the freezer cgroup filesystem inside this cgroup namespace: # mount --make-rslave / # umount /sys/fs/cgroup/freezer # mount -t cgroup -o freezer freezer /sys/fs/cgroup/freezer # cat /proc/self/mountinfo | grep freezer 360 353 0:31 / /sys/fs/cgroup/freezer rw,relatime - cgroup freezer rw,freezer # ls /sys/fs/cgroup/freezer/ cgroup.clone_children cgroup.procs freezer.parent_freezing freezer.self_freezing freezer.state notify_on_release tasks # cat /sys/fs/cgroup/freezer/tasks # (nothing) I have also tried letting the two processes share a pid namespace; in that case the second new bash process’s `/sys/fs/cgroup/freezer/tasks` file is the same as the first one’s. Moreover, if I move the second process under the first process’s cgroupns-root (i.e. `/p1/p2`), then its `tasks` file contains the expected pids: # mkdir -p /sys/fs/cgroup/freezer/p1/p2 # echo $$ 110766 # echo 110766 > /sys/fs/cgroup/freezer/p1/p2/tasks # cat /proc/self/cgroup 7:freezer:/p1/p2 # ./test # cat /proc/self/cgroup | grep freezer 7:freezer:/p2 # cat /proc/self/mountinfo | grep freezer 360 353 0:31 /.. 
/sys/fs/cgroup/freezer rw,relatime - cgroup cgroup rw,freezer # mount --make-rslave / # umount /sys/fs/cgroup/freezer # mount -t cgroup -o freezer freezer /sys/fs/cgroup/freezer # ls /sys/fs/cgroup/freezer/ cgroup.clone_children cgroup.procs freezer.parent_freezing freezer.self_freezing freezer.state notify_on_release p2 tasks # cat /sys/fs/cgroup/freezer/p2/tasks 1 274 In short, the conclusion is that if a process uses `setns()` to join another process’s cgroup namespace without being moved under the target cgroupns-root, then after we remount cgroupfs inside the cgroup namespace, the joining process’s `/sys/fs/cgroup/<subsystem>/tasks` file is the same as the target process’s if the two processes are in the same pid namespace, while this file is empty if the two processes are in different pid namespaces (probably because the joining process cannot see the pid of the target process). So, is this intended behavior or a bug? Or is there anything wrong with my operations above? If so, how can I mount cgroupfs correctly inside a shared cgroup namespace? 
Thanks in advance if anyone could help : ) Program source: #define _GNU_SOURCE #include <unistd.h> #include <sys/types.h> #include <linux/sched.h> #include <sys/mman.h> #include <stdio.h> #include <errno.h> #include <string.h> #include <sys/wait.h> #include <fcntl.h> #define STACK_SIZE 1024*1024*8 //8M int thread_func(void *lparam) { //120719 is the new bash process’s pid in session 1 int fd = open("/proc/120719/ns/cgroup", O_RDONLY); if (fd == -1) return 1; if (setns(fd, CLONE_NEWCGROUP) == -1) return 1; execl("/bin/bash", "bash", NULL); return 0; } int main(int argc, char **argv) { void *pstack = (void *)mmap(NULL, STACK_SIZE, PROT_READ | PROT_WRITE , MAP_PRIVATE | MAP_ANONYMOUS | MAP_ANON , -1, 0); if (MAP_FAILED != pstack) { int ret; ret = clone(thread_func, (void *)((unsigned char *)pstack + STACK_SIZE), CLONE_NEWNS | CLONE_NEWPID, (void *)NULL); if (-1 != ret) { pid_t pid = 0; sleep(5); pid = waitpid(-1, NULL, __WCLONE | __WALL); printf("child : %d exit %s\n", pid,strerror(errno)); } else { printf("clone failed %s\n", strerror(errno) ); } } else { printf("mmap() failed %s\n", strerror(errno)); } return 0; } --- Yuanhong Peng _______________________________________________ Containers mailing list Containers@xxxxxxxxxxxxxxxxxxxxxxxxxx https://lists.linuxfoundation.org/mailman/listinfo/containers