On 5/6/20 12:30 AM, Andrii Nakryiko wrote:
On Sun, May 3, 2020 at 11:28 PM Yonghong Song <yhs@xxxxxx> wrote:
Only the tasks belonging to the "current" pid namespace
are enumerated.
For task/file target, the bpf program will have access to
struct task_struct *task
u32 fd
struct file *file
where fd/file is an open file for the task.
Signed-off-by: Yonghong Song <yhs@xxxxxx>
---
I might be missing some subtleties with task refcounting for the
task_file iterator; I asked a few questions below...
kernel/bpf/Makefile | 2 +-
kernel/bpf/task_iter.c | 336 +++++++++++++++++++++++++++++++++++++++++
2 files changed, 337 insertions(+), 1 deletion(-)
create mode 100644 kernel/bpf/task_iter.c
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index b2b5eefc5254..37b2d8620153 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -2,7 +2,7 @@
obj-y := core.o
CFLAGS_core.o += $(call cc-disable-warning, override-init)
-obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o
+obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o
obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o
obj-$(CONFIG_BPF_SYSCALL) += disasm.o
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
new file mode 100644
index 000000000000..1ca258f6e9f4
--- /dev/null
+++ b/kernel/bpf/task_iter.c
@@ -0,0 +1,336 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2020 Facebook */
+
+#include <linux/init.h>
+#include <linux/namei.h>
+#include <linux/pid_namespace.h>
+#include <linux/fs.h>
+#include <linux/fdtable.h>
+#include <linux/filter.h>
+
+struct bpf_iter_seq_task_common {
+ struct pid_namespace *ns;
+};
+
+struct bpf_iter_seq_task_info {
+ struct bpf_iter_seq_task_common common;
you have a comment below in init_seq_pidns() that common is supposed to
be the very first field, but I think it's more important and
appropriate here, so that whoever adds anything here knows that the
order of fields is important.
I can move the comment here; roughly like the sketch after the struct below.
+ struct task_struct *task;
+ u32 id;
+};
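Something along these lines (untested; exact comment wording TBD):

struct bpf_iter_seq_task_common {
	/* Must stay the very first field of every target-specific
	 * seq_info struct below; init_seq_pidns() only knows about
	 * this common part.
	 */
	struct pid_namespace *ns;
};

struct bpf_iter_seq_task_info {
	/* The first field must be struct bpf_iter_seq_task_common. */
	struct bpf_iter_seq_task_common common;
	struct task_struct *task;
	u32 id;
};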
+
[...]
+static int __task_seq_show(struct seq_file *seq, void *v, bool in_stop)
+{
+ struct bpf_iter_meta meta;
+ struct bpf_iter__task ctx;
+ struct bpf_prog *prog;
+ int ret = 0;
+
+ meta.seq = seq;
+ prog = bpf_iter_get_info(&meta, in_stop);
+ if (prog) {
nit: `if (!prog) return 0;` here would reduce nesting level below
+ meta.seq = seq;
+ ctx.meta = &meta;
+ ctx.task = v;
+ ret = bpf_iter_run_prog(prog, &ctx);
+ }
+
+ return 0;
return **ret**; ?
It should return "ret". The task_file show() code is similar but correct.
I can do the early return with !prog too, although we do not have a
deep nesting level yet; something like the sketch below.
+}
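An untested sketch with both the early return and the ret fix:

static int __task_seq_show(struct seq_file *seq, void *v, bool in_stop)
{
	struct bpf_iter_meta meta;
	struct bpf_iter__task ctx;
	struct bpf_prog *prog;

	meta.seq = seq;
	prog = bpf_iter_get_info(&meta, in_stop);
	if (!prog)
		return 0;

	/* a prog is attached: set up the context and run it */
	meta.seq = seq;
	ctx.meta = &meta;
	ctx.task = v;
	return bpf_iter_run_prog(prog, &ctx);
}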
+
[...]
+
+static struct file *task_file_seq_get_next(struct pid_namespace *ns, u32 *id,
+ int *fd, struct task_struct **task,
+ struct files_struct **fstruct)
+{
+ struct files_struct *files;
+ struct task_struct *tk;
+ u32 sid = *id;
+ int sfd;
+
+ /* If this function returns a non-NULL file object,
+ * it holds a reference to the files_struct and the file.
+ * Otherwise, it does not hold any reference.
+ */
+again:
+ if (*fstruct) {
+ files = *fstruct;
+ sfd = *fd;
+ } else {
+ tk = task_seq_get_next(ns, &sid);
+ if (!tk)
+ return NULL;
+
+ files = get_files_struct(tk);
+ put_task_struct(tk);
task is put here, but is still used below... is there some additional
hidden refcounting involved?
Good question. I was under the impression that we take a reference
count on task->files, so the task should not go away. But reading the
Linux code again, I do not have sufficient evidence to back that claim.
So I will reference count the task as well, i.e., not call
put_task_struct() until all files for this task are done here. This is
reflected in the revised sketch after the function below.
+ if (!files) {
+ sid = ++(*id);
+ *fd = 0;
+ goto again;
+ }
+ *fstruct = files;
+ *task = tk;
+ if (sid == *id) {
+ sfd = *fd;
+ } else {
+ *id = sid;
+ sfd = 0;
+ }
+ }
+
+ rcu_read_lock();
+ for (; sfd < files_fdtable(files)->max_fds; sfd++) {
files_fdtable() does rcu_dereference() on each iteration; would it be
better to just cache files_fdtable(files)->max_fds in a local
variable? It's unlikely that there will be many iterations, but
still...
I borrowed the code from fs/proc/fd.c, but I can certainly avoid
repeatedly reading max_fds as suggested; this is also reflected in the
sketch after the function below.
+ struct file *f;
+
+ f = fcheck_files(files, sfd);
+ if (!f)
+ continue;
+ *fd = sfd;
+ get_file(f);
+ rcu_read_unlock();
+ return f;
+ }
+
+ /* the current task is done, go to the next task */
+ rcu_read_unlock();
+ put_files_struct(files);
+ *fstruct = NULL;
*task = NULL; for completeness?
If *fstruct == NULL, we will try to get the next task, so *task = NULL
is unnecessary, but I can add it; it won't hurt and may make the code
easier to understand (also in the sketch after the function below).
+ sid = ++(*id);
+ *fd = 0;
+ goto again;
+}
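Putting the three points above together (keep a reference on the task
until its files are done, read max_fds only once, and clear *task when
moving on), an untested sketch of the revised function; the actual v2
may differ:

static struct file *task_file_seq_get_next(struct pid_namespace *ns, u32 *id,
					   int *fd, struct task_struct **task,
					   struct files_struct **fstruct)
{
	struct files_struct *files;
	struct task_struct *tk;
	unsigned int max_fds;
	u32 sid = *id;
	int sfd;

	/* If this function returns a non-NULL file object,
	 * it holds references to the task, the files_struct and
	 * the file. Otherwise, it holds no references.
	 */
again:
	if (*fstruct) {
		files = *fstruct;
		tk = *task;
		sfd = *fd;
	} else {
		tk = task_seq_get_next(ns, &sid);
		if (!tk)
			return NULL;

		files = get_files_struct(tk);
		if (!files) {
			/* no files_struct: drop the task and move on */
			put_task_struct(tk);
			sid = ++(*id);
			*fd = 0;
			goto again;
		}
		/* keep the task reference until this task is done */
		*fstruct = files;
		*task = tk;
		if (sid == *id) {
			sfd = *fd;
		} else {
			*id = sid;
			sfd = 0;
		}
	}

	rcu_read_lock();
	max_fds = files_fdtable(files)->max_fds;	/* read once */
	for (; sfd < max_fds; sfd++) {
		struct file *f;

		f = fcheck_files(files, sfd);
		if (!f)
			continue;
		*fd = sfd;
		get_file(f);
		rcu_read_unlock();
		return f;
	}

	/* the current task is done, drop its references and
	 * go to the next task
	 */
	rcu_read_unlock();
	put_files_struct(files);
	put_task_struct(tk);
	*fstruct = NULL;
	*task = NULL;
	sid = ++(*id);
	*fd = 0;
	goto again;
}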
+
+static void *task_file_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct bpf_iter_seq_task_file_info *info = seq->private;
+ struct files_struct *files = NULL;
+ struct task_struct *task = NULL;
+ struct file *file;
+ u32 id = info->id;
+ int fd = info->fd;
+
+ file = task_file_seq_get_next(info->common.ns, &id, &fd, &task, &files);
+ if (!file) {
+ info->files = NULL;
what about info->task here?
info->files == NULL indicates the end of the iteration, so info->task
will not be checked any more. But I can assign NULL to info->task as
well to avoid confusion; see the short sketch after the function below.
+ return NULL;
+ }
+
+ ++*pos;
+ info->id = id;
+ info->fd = fd;
+ info->task = task;
+ info->files = files;
+
+ return file;
+}
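i.e., the !file branch would become something like (sketch):

	file = task_file_seq_get_next(info->common.ns, &id, &fd, &task, &files);
	if (!file) {
		info->files = NULL;
		info->task = NULL;
		return NULL;
	}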
+
[...]
+
+struct bpf_iter__task_file {
+ __bpf_md_ptr(struct bpf_iter_meta *, meta);
+ __bpf_md_ptr(struct task_struct *, task);
+ u32 fd;
nit: sort of works by accident (due to all other fields being 8-byte
aligned pointers); shall we add __attribute__((aligned(8)))?
This is what I thought as well. It should work, but adding aligned(8)
won't hurt and expresses the intention. Will add it; see the sketch
below.
+ __bpf_md_ptr(struct file *, file);
+};
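i.e. something like (sketch; exact attribute spelling TBD):

struct bpf_iter__task_file {
	__bpf_md_ptr(struct bpf_iter_meta *, meta);
	__bpf_md_ptr(struct task_struct *, task);
	u32 fd __attribute__((aligned(8)));
	__bpf_md_ptr(struct file *, file);
};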
+
[...]