On Tue, 2022-08-30 at 16:54 -0700, Yonghong Song wrote: > > > On 8/29/22 12:23 PM, Kui-Feng Lee wrote: > > Allow creating an iterator that loops through resources of one > > thread/process. > > > > People could only create iterators to loop through all resources of > > files, vma, and tasks in the system, even though they were > > interested > > in only the resources of a specific task or process. Passing the > > additional parameters, people can now create an iterator to go > > through all resources or only the resources of a task. > > > > Signed-off-by: Kui-Feng Lee <kuifeng@xxxxxx> > > Acked-by: Yonghong Song <yhs@xxxxxx> > > --- > > include/linux/bpf.h | 25 +++++ > > include/uapi/linux/bpf.h | 6 ++ > > kernel/bpf/task_iter.c | 184 > > +++++++++++++++++++++++++++++---- > > tools/include/uapi/linux/bpf.h | 6 ++ > > 4 files changed, 199 insertions(+), 22 deletions(-) > > > > diff --git a/include/linux/bpf.h b/include/linux/bpf.h > > index 9c1674973e03..31ac2c1181f5 100644 > > --- a/include/linux/bpf.h > > +++ b/include/linux/bpf.h > > @@ -1730,6 +1730,27 @@ int bpf_obj_get_user(const char __user > > *pathname, int flags); > > extern int bpf_iter_ ## target(args); \ > > int __init bpf_iter_ ## target(args) { return 0; } > > > > +/* > > + * The task type of iterators. > > + * > > + * For BPF task iterators, they can be parameterized with various > > + * parameters to visit only some of tasks. > > + * > > + * BPF_TASK_ITER_ALL (default) > > + * Iterate over resources of every task. > > + * > > + * BPF_TASK_ITER_TID > > + * Iterate over resources of a task/tid. > > + * > > + * BPF_TASK_ITER_TGID > > + * Iterate over resources of every task of a process / task > > group. > > + */ > > +enum bpf_iter_task_type { > > + BPF_TASK_ITER_ALL = 0, > > + BPF_TASK_ITER_TID, > > + BPF_TASK_ITER_TGID, > > +}; > > + > > struct bpf_iter_aux_info { > > /* for map_elem iter */ > > struct bpf_map *map; > > @@ -1739,6 +1760,10 @@ struct bpf_iter_aux_info { > > struct cgroup *start; /* starting cgroup */ > > enum bpf_cgroup_iter_order order; > > } cgroup; > > + struct { > > + enum bpf_iter_task_type type; > > + u32 pid; > > + } task; > > }; > > > > typedef int (*bpf_iter_attach_target_t)(struct bpf_prog *prog, > > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h > > index 962960a98835..f212a19eda06 100644 > > --- a/include/uapi/linux/bpf.h > > +++ b/include/uapi/linux/bpf.h > > @@ -110,6 +110,12 @@ union bpf_iter_link_info { > > __u32 cgroup_fd; > > __u64 cgroup_id; > > } cgroup; > > + /* Parameters of task iterators. */ > > + struct { > > + __u32 tid; > > + __u32 pid; > > + __u32 pid_fd; > > + } task; > > }; > > > > /* BPF syscall commands, see bpf(2) man-page for more details. */ > > diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c > > index 8c921799def4..0bc7277d1ee1 100644 > > --- a/kernel/bpf/task_iter.c > > +++ b/kernel/bpf/task_iter.c > > @@ -12,6 +12,9 @@ > > > > struct bpf_iter_seq_task_common { > > struct pid_namespace *ns; > > + enum bpf_iter_task_type type; > > + u32 pid; > > + u32 pid_visiting; > > }; > > > > struct bpf_iter_seq_task_info { > > @@ -22,18 +25,107 @@ struct bpf_iter_seq_task_info { > > u32 tid; > > }; > > > > -static struct task_struct *task_seq_get_next(struct pid_namespace > > *ns, > > +static struct task_struct *task_group_seq_get_next(struct > > bpf_iter_seq_task_common *common, > > + u32 *tid, > > + bool > > skip_if_dup_files) > > +{ > > + struct task_struct *task, *next_task; > > + struct pid *pid; > > + u32 saved_tid; > > + > > + if (!*tid) { > > Add a comment in the above to say that this is for the *very first* > visit of tasks in the process. > > > + pid = find_pid_ns(common->pid, common->ns); > > + if (pid) > > + task = get_pid_task(pid, PIDTYPE_TGID); > > 'task' is not initialized, so it is possible task could hold a > garbase value here if !pid, right? > > Also if indeed task is NULL, here, should we return NULL here > first? yes, it should return earlier. > > > + > > + *tid = common->pid; > > + common->pid_visiting = common->pid; > > + > > + return task; > > + } > > + > > + /* The callers increase *tid by 1 once they want next task. > > + * However, next_thread() doesn't return tasks in > > incremental > > + * order of pids. We can not find next task by just finding > > a > > + * task whose pid is greater or equal to *tid. > > pid_visiting > > + * remembers the pid value of the task returned last time. > > By > > + * comparing pid_visiting and *tid, we known if the caller > > + * wants the next task. > > + */ > > + if (*tid == common->pid_visiting) { > > + pid = find_pid_ns(common->pid_visiting, common- > > >ns); > > + task = get_pid_task(pid, PIDTYPE_PID); > > + > > + return task; > > + } > > Do not understand the above code. Why we need it? Looks like > the code below trying to get the *next_task* and will return NULL > if wrap around happens(the tid again equals tgid), right? The above code is to handle the case that the caller want to visit the same task again. For example, task_file_seq_get_next() will call this function several time to return the same task, and move to next task by increasing info->tid. The above code checks the value of *tid to return the same task if the value doesn't change. > > > + > > +retry: > > + pid = find_pid_ns(common->pid_visiting, common->ns); > > + if (!pid) > > + return NULL; > > + > > + task = get_pid_task(pid, PIDTYPE_PID); > > + if (!task) > > + return NULL; > > + > > + next_task = next_thread(task); > > + put_task_struct(task); > > + if (!next_task) > > + return NULL; > > + > > + saved_tid = *tid; > > + *tid = __task_pid_nr_ns(next_task, PIDTYPE_PID, common- > > >ns); > > + if (*tid == common->pid) { > > + /* Run out of tasks of a process. The tasks of a > > + * thread_group are linked as circular linked list. > > + */ > > + *tid = saved_tid; > > + return NULL; > > + } > > + > > + get_task_struct(next_task); > > + common->pid_visiting = *tid; > > We could do quite some redundant works here if the following > condition is true. Basically, we get next_task and get a tid > and release it, but in the next iteration, from tid, we try to get > the task again. Yes, I will move 'retry' and move next_task to task to avoid the redundant work. > > > + > > + if (skip_if_dup_files && task->files == task->group_leader- > > >files) > > + goto retry; > > + > > + return next_task; > > +} > > + > > +static struct task_struct *task_seq_get_next(struct > > bpf_iter_seq_task_common *common, > > u32 *tid, > > bool > > skip_if_dup_files) > > { > > struct task_struct *task = NULL; > > struct pid *pid; > > > > + if (common->type == BPF_TASK_ITER_TID) { > > + if (*tid && *tid != common->pid) > > + return NULL; > > + rcu_read_lock(); > > + pid = find_pid_ns(common->pid, common->ns); > > + if (pid) { > > + task = get_pid_task(pid, PIDTYPE_TGID); > > + *tid = common->pid; > > + } > > + rcu_read_unlock(); > > + > > + return task; > > + } > > + > > + if (common->type == BPF_TASK_ITER_TGID) { > > + rcu_read_lock(); > > + task = task_group_seq_get_next(common, tid, > > skip_if_dup_files); > > + rcu_read_unlock(); > > + > > + return task; > > + } > > + > > rcu_read_lock(); > > retry: > > - pid = find_ge_pid(*tid, ns); > > + pid = find_ge_pid(*tid, common->ns); > > if (pid) { > > - *tid = pid_nr_ns(pid, ns); > > + *tid = pid_nr_ns(pid, common->ns); > > task = get_pid_task(pid, PIDTYPE_PID); > > if (!task) { > > ++*tid; > > @@ -56,7 +148,7 @@ static void *task_seq_start(struct seq_file > > *seq, loff_t *pos) > > struct bpf_iter_seq_task_info *info = seq->private; > > struct task_struct *task; > > > > - task = task_seq_get_next(info->common.ns, &info->tid, > > false); > > + task = task_seq_get_next(&info->common, &info->tid, false); > > if (!task) > > return NULL; > > > > @@ -73,7 +165,7 @@ static void *task_seq_next(struct seq_file *seq, > > void *v, loff_t *pos) > > ++*pos; > > ++info->tid; > > put_task_struct((struct task_struct *)v); > > - task = task_seq_get_next(info->common.ns, &info->tid, > > false); > > + task = task_seq_get_next(&info->common, &info->tid, false); > > if (!task) > > return NULL; > > > > @@ -117,6 +209,45 @@ static void task_seq_stop(struct seq_file > > *seq, void *v) > > put_task_struct((struct task_struct *)v); > > } > > > [...]