Re: [PATCH bpf-next v2 11/20] bpf: add task and task/file iterator targets

Yonghong Song <yhs@xxxxxx> · Wed, 6 May 2020 14:20:10 -0700

On 5/6/20 1:51 PM, Andrii Nakryiko wrote:
On Wed, May 6, 2020 at 11:24 AM Yonghong Song <yhs@xxxxxx> wrote:



On 5/6/20 12:30 AM, Andrii Nakryiko wrote:
On Sun, May 3, 2020 at 11:28 PM Yonghong Song <yhs@xxxxxx> wrote:

Only the tasks belonging to "current" pid namespace
are enumerated.

For task/file target, the bpf program will have access to
    struct task_struct *task
    u32 fd
    struct file *file
where fd/file is an open file for the task.

Signed-off-by: Yonghong Song <yhs@xxxxxx>
---

I might be missing some subtleties with task refcounting for task_file
iterator, asked few questions below...

   kernel/bpf/Makefile    |   2 +-
   kernel/bpf/task_iter.c | 336 +++++++++++++++++++++++++++++++++++++++++
   2 files changed, 337 insertions(+), 1 deletion(-)
   create mode 100644 kernel/bpf/task_iter.c

diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index b2b5eefc5254..37b2d8620153 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -2,7 +2,7 @@
   obj-y := core.o
   CFLAGS_core.o += $(call cc-disable-warning, override-init)

-obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o
+obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o
   obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
   obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o
   obj-$(CONFIG_BPF_SYSCALL) += disasm.o
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
new file mode 100644
index 000000000000..1ca258f6e9f4
--- /dev/null
+++ b/kernel/bpf/task_iter.c
@@ -0,0 +1,336 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2020 Facebook */
+
+#include <linux/init.h>
+#include <linux/namei.h>
+#include <linux/pid_namespace.h>
+#include <linux/fs.h>
+#include <linux/fdtable.h>
+#include <linux/filter.h>
+
+struct bpf_iter_seq_task_common {
+       struct pid_namespace *ns;
+};
+
+struct bpf_iter_seq_task_info {
+       struct bpf_iter_seq_task_common common;

you have comment below in init_seq_pidns() that common is supposed to
be the very first field, but I think it's more important and
appropriate here, so that whoever adds anything here knows that order
of field is important.

I can move the comments here.


+       struct task_struct *task;
+       u32 id;
+};
+

[...]

+static int __task_seq_show(struct seq_file *seq, void *v, bool in_stop)
+{
+       struct bpf_iter_meta meta;
+       struct bpf_iter__task ctx;
+       struct bpf_prog *prog;
+       int ret = 0;
+
+       meta.seq = seq;
+       prog = bpf_iter_get_info(&meta, in_stop);
+       if (prog) {


nit: `if (!prog) return 0;` here would reduce nesting level below

+               meta.seq = seq;
+               ctx.meta = &meta;
+               ctx.task = v;
+               ret = bpf_iter_run_prog(prog, &ctx);
+       }
+
+       return 0;

return **ret**; ?

It should return "ret". In task_file show() code is similar but correct.
I can do early return with !prog too although we do not have
deep nesting level yet.


+}
+

[...]

+
+static struct file *task_file_seq_get_next(struct pid_namespace *ns, u32 *id,
+                                          int *fd, struct task_struct **task,
+                                          struct files_struct **fstruct)
+{
+       struct files_struct *files;
+       struct task_struct *tk;
+       u32 sid = *id;
+       int sfd;
+
+       /* If this function returns a non-NULL file object,
+        * it held a reference to the files_struct and file.
+        * Otherwise, it does not hold any reference.
+        */
+again:
+       if (*fstruct) {
+               files = *fstruct;
+               sfd = *fd;
+       } else {
+               tk = task_seq_get_next(ns, &sid);
+               if (!tk)
+                       return NULL;
+
+               files = get_files_struct(tk);
+               put_task_struct(tk);

task is put here, but is still used below.. is there some additional
hidden refcounting involved?

Good question. I had an impression that we take a reference count
for task->files so task should not go away. But reading linux
code again, I do not have sufficient evidence to back my claim.
So I will reference count task as well, e.g., do not put_task_struct()
until all files are done here.

All threads within the process share files table. So some threads
might exit, but files will stay, which is why task_struct and
files_struct have separate refcounting, and having refcount on files
doesn't guarantee any particular task will stay alive for long enough.
So I think we need to refcount both files and task in this case.
Reading source code of copy_files() in kernel/fork.c (CLONE_FILES
flags just bumps refcnt on old process' files_struct), seems to
confirm this as well.

Just checked the code. It does look like files are shared among
threads (tasks). So yes, in this case, reference counting to
both task and file_table needed.




+               if (!files) {
+                       sid = ++(*id);
+                       *fd = 0;
+                       goto again;
+               }
+               *fstruct = files;
+               *task = tk;
+               if (sid == *id) {
+                       sfd = *fd;
+               } else {
+                       *id = sid;
+                       sfd = 0;
+               }
+       }
+
+       rcu_read_lock();
[...]