Re: [PATCH bpf-next v8 1/5] bpf: Parameterize task iterators.

Yonghong Song <yhs@xxxxxx> · Tue, 30 Aug 2022 19:37:40 -0700

On 8/30/22 5:35 PM, Kui-Feng Lee wrote:
On Tue, 2022-08-30 at 16:54 -0700, Yonghong Song wrote:


On 8/29/22 12:23 PM, Kui-Feng Lee wrote:
Allow creating an iterator that loops through resources of one
thread/process.

People could only create iterators to loop through all resources of
files, vma, and tasks in the system, even though they were
interested
in only the resources of a specific task or process.  Passing the
additional parameters, people can now create an iterator to go
through all resources or only the resources of a task.

Signed-off-by: Kui-Feng Lee <kuifeng@xxxxxx>
Acked-by: Yonghong Song <yhs@xxxxxx>
---
   include/linux/bpf.h            |  25 +++++
   include/uapi/linux/bpf.h       |   6 ++
   kernel/bpf/task_iter.c         | 184
+++++++++++++++++++++++++++++----
   tools/include/uapi/linux/bpf.h |   6 ++
   4 files changed, 199 insertions(+), 22 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 9c1674973e03..31ac2c1181f5 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1730,6 +1730,27 @@ int bpf_obj_get_user(const char __user
*pathname, int flags);
         extern int bpf_iter_ ## target(args);                   \
         int __init bpf_iter_ ## target(args) { return 0; }
   
+/*
+ * The task type of iterators.
+ *
+ * For BPF task iterators, they can be parameterized with various
+ * parameters to visit only some of tasks.
+ *
+ * BPF_TASK_ITER_ALL (default)
+ *     Iterate over resources of every task.
+ *
+ * BPF_TASK_ITER_TID
+ *     Iterate over resources of a task/tid.
+ *
+ * BPF_TASK_ITER_TGID
+ *     Iterate over resources of every task of a process / task
group.
+ */
+enum bpf_iter_task_type {
+       BPF_TASK_ITER_ALL = 0,
+       BPF_TASK_ITER_TID,
+       BPF_TASK_ITER_TGID,
+};
+
   struct bpf_iter_aux_info {
         /* for map_elem iter */
         struct bpf_map *map;
@@ -1739,6 +1760,10 @@ struct bpf_iter_aux_info {
                 struct cgroup *start; /* starting cgroup */
                 enum bpf_cgroup_iter_order order;
         } cgroup;
+       struct {
+               enum bpf_iter_task_type type;
+               u32 pid;
+       } task;
   };
   
   typedef int (*bpf_iter_attach_target_t)(struct bpf_prog *prog,
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 962960a98835..f212a19eda06 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -110,6 +110,12 @@ union bpf_iter_link_info {
                 __u32   cgroup_fd;
                 __u64   cgroup_id;
         } cgroup;
+       /* Parameters of task iterators. */
+       struct {
+               __u32   tid;
+               __u32   pid;
+               __u32   pid_fd;
+       } task;
   };
   
   /* BPF syscall commands, see bpf(2) man-page for more details. */
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index 8c921799def4..0bc7277d1ee1 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -12,6 +12,9 @@
   
   struct bpf_iter_seq_task_common {
         struct pid_namespace *ns;
+       enum bpf_iter_task_type type;
+       u32 pid;
+       u32 pid_visiting;
   };
   
   struct bpf_iter_seq_task_info {
@@ -22,18 +25,107 @@ struct bpf_iter_seq_task_info {
         u32 tid;
   };
   
-static struct task_struct *task_seq_get_next(struct pid_namespace
*ns,
+static struct task_struct *task_group_seq_get_next(struct
bpf_iter_seq_task_common *common,
+                                                  u32 *tid,
+                                                  bool
skip_if_dup_files)
+{
+       struct task_struct *task, *next_task;
+       struct pid *pid;
+       u32 saved_tid;
+
+       if (!*tid) {

Add a comment in the above to say that this is for the *very first*
visit of tasks in the process.

+               pid = find_pid_ns(common->pid, common->ns);
+               if (pid)
+                       task = get_pid_task(pid, PIDTYPE_TGID);

'task' is not initialized, so it is possible task could hold a
garbase value here if !pid, right?

Also if indeed task is NULL, here, should we return NULL here
first?

yes, it should return earlier.


+
+               *tid = common->pid;
+               common->pid_visiting = common->pid;
+
+               return task;
+       }
+
+       /* The callers increase *tid by 1 once they want next task.
+        * However, next_thread() doesn't return tasks in
incremental
+        * order of pids. We can not find next task by just finding
a
+        * task whose pid is greater or equal to *tid.
pid_visiting
+        * remembers the pid value of the task returned last time.
By
+        * comparing pid_visiting and *tid, we known if the caller
+        * wants the next task.
+        */
+       if (*tid == common->pid_visiting) {
+               pid = find_pid_ns(common->pid_visiting, common-
ns);
+               task = get_pid_task(pid, PIDTYPE_PID);
+
+               return task;
+       }

Do not understand the above code. Why we need it? Looks like
the code below trying to get the *next_task* and will return NULL
if wrap around happens(the tid again equals tgid), right?

The above code is to handle the case that the caller want to visit the
same task again.  For example, task_file_seq_get_next() will call this
function several time to return the same task, and move to next task by
increasing info->tid.  The above code checks the value of *tid to
return the same task if the value doesn't change.

Could you explain when task_file_seq_get_next() will call this function
several times? IIUC, from the following code,

+static struct task_struct *task_seq_get_next(struct 
bpf_iter_seq_task_common *common,
 					     u32 *tid,
 					     bool skip_if_dup_files)
 {
 	struct task_struct *task = NULL;
 	struct pid *pid;

+	if (common->type == BPF_TASK_ITER_TID) {
+		if (*tid && *tid != common->pid)
+			return NULL;
+		rcu_read_lock();
+		pid = find_pid_ns(common->pid, common->ns);
+		if (pid) {
+			task = get_pid_task(pid, PIDTYPE_TGID);
+			*tid = common->pid;
+		}
+		rcu_read_unlock();
+
+		return task;
+	}
+
+	if (common->type == BPF_TASK_ITER_TGID) {
+		rcu_read_lock();
+		task = task_group_seq_get_next(common, tid, skip_if_dup_files);
+		rcu_read_unlock();
+
+		return task;
+	}
+

task_group_seq_get_next() is only called once per task_seq_get_next()
for BPF_TASK_ITER_TGID.
Maybe I missed something?





+
+retry:
+       pid = find_pid_ns(common->pid_visiting, common->ns);
+       if (!pid)
+               return NULL;
+
+       task = get_pid_task(pid, PIDTYPE_PID);
+       if (!task)
+               return NULL;
+
+       next_task = next_thread(task);
+       put_task_struct(task);
+       if (!next_task)
+               return NULL;
+
+       saved_tid = *tid;
+       *tid = __task_pid_nr_ns(next_task, PIDTYPE_PID, common-
ns);
+       if (*tid == common->pid) {
+               /* Run out of tasks of a process.  The tasks of a
+                * thread_group are linked as circular linked list.
+                */
+               *tid = saved_tid;
+               return NULL;
+       }
+
+       get_task_struct(next_task);
+       common->pid_visiting = *tid;

We could do quite some redundant works here if the following
condition is true. Basically, we get next_task and get a tid
and release it, but in the next iteration, from tid, we try to get
the task again.

Yes, I will move 'retry' and move next_task to task to avoid the
redundant work.


+
+       if (skip_if_dup_files && task->files == task->group_leader-
files)
+               goto retry;
+
+       return next_task;
+}
+
+static struct task_struct *task_seq_get_next(struct
bpf_iter_seq_task_common *common,
                                              u32 *tid,
                                              bool
skip_if_dup_files)
   {
         struct task_struct *task = NULL;
         struct pid *pid;
   
+       if (common->type == BPF_TASK_ITER_TID) {
+               if (*tid && *tid != common->pid)
+                       return NULL;
+               rcu_read_lock();
+               pid = find_pid_ns(common->pid, common->ns);
+               if (pid) {
+                       task = get_pid_task(pid, PIDTYPE_TGID);
+                       *tid = common->pid;
+               }
+               rcu_read_unlock();
+
+               return task;
+       }
+
+       if (common->type == BPF_TASK_ITER_TGID) {
+               rcu_read_lock();
+               task = task_group_seq_get_next(common, tid,
skip_if_dup_files);
+               rcu_read_unlock();
+
+               return task;
+       }
+
         rcu_read_lock();
   retry:
-       pid = find_ge_pid(*tid, ns);
+       pid = find_ge_pid(*tid, common->ns);
         if (pid) {
-               *tid = pid_nr_ns(pid, ns);
+               *tid = pid_nr_ns(pid, common->ns);
                 task = get_pid_task(pid, PIDTYPE_PID);
                 if (!task) {
                         ++*tid;
@@ -56,7 +148,7 @@ static void *task_seq_start(struct seq_file
*seq, loff_t *pos)
         struct bpf_iter_seq_task_info *info = seq->private;
         struct task_struct *task;
   
-       task = task_seq_get_next(info->common.ns, &info->tid,
false);
+       task = task_seq_get_next(&info->common, &info->tid, false);
         if (!task)
                 return NULL;
   
@@ -73,7 +165,7 @@ static void *task_seq_next(struct seq_file *seq,
void *v, loff_t *pos)
         ++*pos;
         ++info->tid;
         put_task_struct((struct task_struct *)v);
-       task = task_seq_get_next(info->common.ns, &info->tid,
false);
+       task = task_seq_get_next(&info->common, &info->tid, false);
         if (!task)
                 return NULL;
   
@@ -117,6 +209,45 @@ static void task_seq_stop(struct seq_file
*seq, void *v)
                 put_task_struct((struct task_struct *)v);
   }
   
[...]