On Sun, May 3, 2020 at 11:28 PM Yonghong Song <yhs@xxxxxx> wrote: > > Only the tasks belonging to "current" pid namespace > are enumerated. > > For task/file target, the bpf program will have access to > struct task_struct *task > u32 fd > struct file *file > where fd/file is an open file for the task. > > Signed-off-by: Yonghong Song <yhs@xxxxxx> > --- I might be missing some subtleties with task refcounting for task_file iterator, I asked a few questions below... > kernel/bpf/Makefile | 2 +- > kernel/bpf/task_iter.c | 336 +++++++++++++++++++++++++++++++++++++++++ > 2 files changed, 337 insertions(+), 1 deletion(-) > create mode 100644 kernel/bpf/task_iter.c > > diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile > index b2b5eefc5254..37b2d8620153 100644 > --- a/kernel/bpf/Makefile > +++ b/kernel/bpf/Makefile > @@ -2,7 +2,7 @@ > obj-y := core.o > CFLAGS_core.o += $(call cc-disable-warning, override-init) > > -obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o > +obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o > obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o > obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o > obj-$(CONFIG_BPF_SYSCALL) += disasm.o > diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c > new file mode 100644 > index 000000000000..1ca258f6e9f4 > --- /dev/null > +++ b/kernel/bpf/task_iter.c > @@ -0,0 +1,336 @@ > +// SPDX-License-Identifier: GPL-2.0-only > +/* Copyright (c) 2020 Facebook */ > + > +#include <linux/init.h> > +#include <linux/namei.h> > +#include <linux/pid_namespace.h> > +#include <linux/fs.h> > +#include <linux/fdtable.h> > +#include <linux/filter.h> > + > +struct bpf_iter_seq_task_common { > + struct pid_namespace *ns; > +}; > + > +struct bpf_iter_seq_task_info { > + struct bpf_iter_seq_task_common common; you have a comment below in init_seq_pidns() 
that common is supposed to be the very first field, but I think it's more important and appropriate here, so that whoever adds anything here knows that the order of fields is important. > + struct task_struct *task; > + u32 id; > +}; > + [...] > +static int __task_seq_show(struct seq_file *seq, void *v, bool in_stop) > +{ > + struct bpf_iter_meta meta; > + struct bpf_iter__task ctx; > + struct bpf_prog *prog; > + int ret = 0; > + > + meta.seq = seq; > + prog = bpf_iter_get_info(&meta, in_stop); > + if (prog) { nit: `if (!prog) return 0;` here would reduce nesting level below > + meta.seq = seq; > + ctx.meta = &meta; > + ctx.task = v; > + ret = bpf_iter_run_prog(prog, &ctx); > + } > + > + return 0; return **ret**; ? > +} > + [...] > + > +static struct file *task_file_seq_get_next(struct pid_namespace *ns, u32 *id, > + int *fd, struct task_struct **task, > + struct files_struct **fstruct) > +{ > + struct files_struct *files; > + struct task_struct *tk; > + u32 sid = *id; > + int sfd; > + > + /* If this function returns a non-NULL file object, > + * it held a reference to the files_struct and file. > + * Otherwise, it does not hold any reference. > + */ > +again: > + if (*fstruct) { > + files = *fstruct; > + sfd = *fd; > + } else { > + tk = task_seq_get_next(ns, &sid); > + if (!tk) > + return NULL; > + > + files = get_files_struct(tk); > + put_task_struct(tk); task is put here, but is still used below... is there some additional hidden refcounting involved? > + if (!files) { > + sid = ++(*id); > + *fd = 0; > + goto again; > + } > + *fstruct = files; > + *task = tk; > + if (sid == *id) { > + sfd = *fd; > + } else { > + *id = sid; > + sfd = 0; > + } > + } > + > + rcu_read_lock(); > + for (; sfd < files_fdtable(files)->max_fds; sfd++) { files_fdtable does rcu_dereference on each iteration, would it be better to just cache files_fdtable(files)->max_fds into a local variable? It's unlikely that there will be many iterations, but still... 
> + struct file *f; > + > + f = fcheck_files(files, sfd); > + if (!f) > + continue; > + *fd = sfd; > + get_file(f); > + rcu_read_unlock(); > + return f; > + } > + > + /* the current task is done, go to the next task */ > + rcu_read_unlock(); > + put_files_struct(files); > + *fstruct = NULL; *task = NULL; for completeness? > + sid = ++(*id); > + *fd = 0; > + goto again; > +} > + > +static void *task_file_seq_start(struct seq_file *seq, loff_t *pos) > +{ > + struct bpf_iter_seq_task_file_info *info = seq->private; > + struct files_struct *files = NULL; > + struct task_struct *task = NULL; > + struct file *file; > + u32 id = info->id; > + int fd = info->fd; > + > + file = task_file_seq_get_next(info->common.ns, &id, &fd, &task, &files); > + if (!file) { > + info->files = NULL; what about info->task here? > + return NULL; > + } > + > + ++*pos; > + info->id = id; > + info->fd = fd; > + info->task = task; > + info->files = files; > + > + return file; > +} > + [...] > + > +struct bpf_iter__task_file { > + __bpf_md_ptr(struct bpf_iter_meta *, meta); > + __bpf_md_ptr(struct task_struct *, task); > + u32 fd; nit: sort of works by accident (due to all other fields being 8-byte aligned pointers), shall we add __attribute__((aligned(8)))? > + __bpf_md_ptr(struct file *, file); > +}; > + [...]