On Tue, 2022-08-30 at 19:37 -0700, Yonghong Song wrote: > > > On 8/30/22 5:35 PM, Kui-Feng Lee wrote: > > On Tue, 2022-08-30 at 16:54 -0700, Yonghong Song wrote: > > > > > > > > > On 8/29/22 12:23 PM, Kui-Feng Lee wrote: > > > > Allow creating an iterator that loops through resources of one > > > > thread/process. > > > > > > > > People could only create iterators to loop through all > > > > resources of > > > > files, vma, and tasks in the system, even though they were > > > > interested > > > > in only the resources of a specific task or process. Passing > > > > the > > > > additional parameters, people can now create an iterator to go > > > > through all resources or only the resources of a task. > > > > > > > > Signed-off-by: Kui-Feng Lee <kuifeng@xxxxxx> > > > > Acked-by: Yonghong Song <yhs@xxxxxx> > > > > --- > > > > include/linux/bpf.h | 25 +++++ > > > > include/uapi/linux/bpf.h | 6 ++ > > > > kernel/bpf/task_iter.c | 184 > > > > +++++++++++++++++++++++++++++---- > > > > tools/include/uapi/linux/bpf.h | 6 ++ > > > > 4 files changed, 199 insertions(+), 22 deletions(-) > > > > > > > > diff --git a/include/linux/bpf.h b/include/linux/bpf.h > > > > index 9c1674973e03..31ac2c1181f5 100644 > > > > --- a/include/linux/bpf.h > > > > +++ b/include/linux/bpf.h > > > > @@ -1730,6 +1730,27 @@ int bpf_obj_get_user(const char __user > > > > *pathname, int flags); > > > > extern int bpf_iter_ ## > > > > target(args); \ > > > > int __init bpf_iter_ ## target(args) { return 0; } > > > > > > > > +/* > > > > + * The task type of iterators. > > > > + * > > > > + * For BPF task iterators, they can be parameterized with > > > > various > > > > + * parameters to visit only some of tasks. > > > > + * > > > > + * BPF_TASK_ITER_ALL (default) > > > > + * Iterate over resources of every task. > > > > + * > > > > + * BPF_TASK_ITER_TID > > > > + * Iterate over resources of a task/tid. > > > > + * > > > > + * BPF_TASK_ITER_TGID > > > > + * Iterate over resources of every task of a process / > > > > task > > > > group. > > > > + */ > > > > +enum bpf_iter_task_type { > > > > + BPF_TASK_ITER_ALL = 0, > > > > + BPF_TASK_ITER_TID, > > > > + BPF_TASK_ITER_TGID, > > > > +}; > > > > + > > > > struct bpf_iter_aux_info { > > > > /* for map_elem iter */ > > > > struct bpf_map *map; > > > > @@ -1739,6 +1760,10 @@ struct bpf_iter_aux_info { > > > > struct cgroup *start; /* starting cgroup */ > > > > enum bpf_cgroup_iter_order order; > > > > } cgroup; > > > > + struct { > > > > + enum bpf_iter_task_type type; > > > > + u32 pid; > > > > + } task; > > > > }; > > > > > > > > typedef int (*bpf_iter_attach_target_t)(struct bpf_prog > > > > *prog, > > > > diff --git a/include/uapi/linux/bpf.h > > > > b/include/uapi/linux/bpf.h > > > > index 962960a98835..f212a19eda06 100644 > > > > --- a/include/uapi/linux/bpf.h > > > > +++ b/include/uapi/linux/bpf.h > > > > @@ -110,6 +110,12 @@ union bpf_iter_link_info { > > > > __u32 cgroup_fd; > > > > __u64 cgroup_id; > > > > } cgroup; > > > > + /* Parameters of task iterators. */ > > > > + struct { > > > > + __u32 tid; > > > > + __u32 pid; > > > > + __u32 pid_fd; > > > > + } task; > > > > }; > > > > > > > > /* BPF syscall commands, see bpf(2) man-page for more > > > > details. */ > > > > diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c > > > > index 8c921799def4..0bc7277d1ee1 100644 > > > > --- a/kernel/bpf/task_iter.c > > > > +++ b/kernel/bpf/task_iter.c > > > > @@ -12,6 +12,9 @@ > > > > > > > > struct bpf_iter_seq_task_common { > > > > struct pid_namespace *ns; > > > > + enum bpf_iter_task_type type; > > > > + u32 pid; > > > > + u32 pid_visiting; > > > > }; > > > > > > > > struct bpf_iter_seq_task_info { > > > > @@ -22,18 +25,107 @@ struct bpf_iter_seq_task_info { > > > > u32 tid; > > > > }; > > > > > > > > -static struct task_struct *task_seq_get_next(struct > > > > pid_namespace > > > > *ns, > > > > +static struct task_struct *task_group_seq_get_next(struct > > > > bpf_iter_seq_task_common *common, > > > > + u32 *tid, > > > > + bool > > > > skip_if_dup_files) > > > > +{ > > > > + struct task_struct *task, *next_task; > > > > + struct pid *pid; > > > > + u32 saved_tid; > > > > + > > > > + if (!*tid) { > > > > > > Add a comment in the above to say that this is for the *very > > > first* > > > visit of tasks in the process. > > > > > > > + pid = find_pid_ns(common->pid, common->ns); > > > > + if (pid) > > > > + task = get_pid_task(pid, PIDTYPE_TGID); > > > > > > 'task' is not initialized, so it is possible task could hold a > > > garbase value here if !pid, right? > > > > > > Also if indeed task is NULL, here, should we return NULL here > > > first? > > > > yes, it should return earlier. > > > > > > > > > + > > > > + *tid = common->pid; > > > > + common->pid_visiting = common->pid; > > > > + > > > > + return task; > > > > + } > > > > + > > > > + /* The callers increase *tid by 1 once they want next > > > > task. > > > > + * However, next_thread() doesn't return tasks in > > > > incremental > > > > + * order of pids. We can not find next task by just > > > > finding > > > > a > > > > + * task whose pid is greater or equal to *tid. > > > > pid_visiting > > > > + * remembers the pid value of the task returned last > > > > time. > > > > By > > > > + * comparing pid_visiting and *tid, we known if the > > > > caller > > > > + * wants the next task. > > > > + */ > > > > + if (*tid == common->pid_visiting) { > > > > + pid = find_pid_ns(common->pid_visiting, common- > > > > > ns); > > > > + task = get_pid_task(pid, PIDTYPE_PID); > > > > + > > > > + return task; > > > > + } > > > > > > Do not understand the above code. Why we need it? Looks like > > > the code below trying to get the *next_task* and will return NULL > > > if wrap around happens(the tid again equals tgid), right? > > > > The above code is to handle the case that the caller want to visit > > the > > same task again. For example, task_file_seq_get_next() will call > > this > > function several time to return the same task, and move to next > > task by > > increasing info->tid. The above code checks the value of *tid to > > return the same task if the value doesn't change. > > Could you explain when task_file_seq_get_next() will call this > function > several times? IIUC, from the following code, Let's assume that a userspace program reads an BPF file iterator in several times instead of reading the whole content in once. bpf_seq_read() will be called several times. Every time returning from bpf_seq_read except the last one, info->tid will be the tid of the latest visited task. bpf_seq_read() will call seq->op->start() (a.k.a task_file_seq_start() in this case). However, task_file_seq_start() will reset info->task, and call task_file_seq_get_next() to return the task which has tid giving by *tid. Since info->task is NULL and *tid is not 0, task_file_seq_get_next() will call task_seq_get_next() to retrieve the same task again. Does that make sense? > > +static struct task_struct *task_seq_get_next(struct > bpf_iter_seq_task_common *common, > u32 *tid, > bool skip_if_dup_files) > { > struct task_struct *task = NULL; > struct pid *pid; > > + if (common->type == BPF_TASK_ITER_TID) { > + if (*tid && *tid != common->pid) > + return NULL; > + rcu_read_lock(); > + pid = find_pid_ns(common->pid, common->ns); > + if (pid) { > + task = get_pid_task(pid, PIDTYPE_TGID); > + *tid = common->pid; > + } > + rcu_read_unlock(); > + > + return task; > + } > + > + if (common->type == BPF_TASK_ITER_TGID) { > + rcu_read_lock(); > + task = task_group_seq_get_next(common, tid, > skip_if_dup_files); > + rcu_read_unlock(); > + > + return task; > + } > + > > task_group_seq_get_next() is only called once per task_seq_get_next() > for BPF_TASK_ITER_TGID. > Maybe I missed something? > > > > > > > > > > > > + > > > > +retry: > > > > + pid = find_pid_ns(common->pid_visiting, common->ns); > > > > + if (!pid) > > > > + return NULL; > > > > + > > > > + task = get_pid_task(pid, PIDTYPE_PID); > > > > + if (!task) > > > > + return NULL; > > > > + > > > > + next_task = next_thread(task); > > > > + put_task_struct(task); > > > > + if (!next_task) > > > > + return NULL; > > > > + > > > > + saved_tid = *tid; > > > > + *tid = __task_pid_nr_ns(next_task, PIDTYPE_PID, common- > > > > > ns); > > > > + if (*tid == common->pid) { > > > > + /* Run out of tasks of a process. The tasks of > > > > a > > > > + * thread_group are linked as circular linked > > > > list. > > > > + */ > > > > + *tid = saved_tid; > > > > + return NULL; > > > > + } > > > > + > > > > + get_task_struct(next_task); > > > > + common->pid_visiting = *tid; > > > > > > We could do quite some redundant works here if the following > > > condition is true. Basically, we get next_task and get a tid > > > and release it, but in the next iteration, from tid, we try to > > > get > > > the task again. > > > > Yes, I will move 'retry' and move next_task to task to avoid the > > redundant work. > > > > > > > > > + > > > > + if (skip_if_dup_files && task->files == task- > > > > >group_leader- > > > > > files) > > > > + goto retry; > > > > + > > > > + return next_task; > > > > +} > > > > + > > > > +static struct task_struct *task_seq_get_next(struct > > > > bpf_iter_seq_task_common *common, > > > > u32 *tid, > > > > bool > > > > skip_if_dup_files) > > > > { > > > > struct task_struct *task = NULL; > > > > struct pid *pid; > > > > > > > > + if (common->type == BPF_TASK_ITER_TID) { > > > > + if (*tid && *tid != common->pid) > > > > + return NULL; > > > > + rcu_read_lock(); > > > > + pid = find_pid_ns(common->pid, common->ns); > > > > + if (pid) { > > > > + task = get_pid_task(pid, PIDTYPE_TGID); > > > > + *tid = common->pid; > > > > + } > > > > + rcu_read_unlock(); > > > > + > > > > + return task; > > > > + } > > > > + > > > > + if (common->type == BPF_TASK_ITER_TGID) { > > > > + rcu_read_lock(); > > > > + task = task_group_seq_get_next(common, tid, > > > > skip_if_dup_files); > > > > + rcu_read_unlock(); > > > > + > > > > + return task; > > > > + } > > > > + > > > > rcu_read_lock(); > > > > retry: > > > > - pid = find_ge_pid(*tid, ns); > > > > + pid = find_ge_pid(*tid, common->ns); > > > > if (pid) { > > > > - *tid = pid_nr_ns(pid, ns); > > > > + *tid = pid_nr_ns(pid, common->ns); > > > > task = get_pid_task(pid, PIDTYPE_PID); > > > > if (!task) { > > > > ++*tid; > > > > @@ -56,7 +148,7 @@ static void *task_seq_start(struct seq_file > > > > *seq, loff_t *pos) > > > > struct bpf_iter_seq_task_info *info = seq->private; > > > > struct task_struct *task; > > > > > > > > - task = task_seq_get_next(info->common.ns, &info->tid, > > > > false); > > > > + task = task_seq_get_next(&info->common, &info->tid, > > > > false); > > > > if (!task) > > > > return NULL; > > > > > > > > @@ -73,7 +165,7 @@ static void *task_seq_next(struct seq_file > > > > *seq, > > > > void *v, loff_t *pos) > > > > ++*pos; > > > > ++info->tid; > > > > put_task_struct((struct task_struct *)v); > > > > - task = task_seq_get_next(info->common.ns, &info->tid, > > > > false); > > > > + task = task_seq_get_next(&info->common, &info->tid, > > > > false); > > > > if (!task) > > > > return NULL; > > > > > > > > @@ -117,6 +209,45 @@ static void task_seq_stop(struct seq_file > > > > *seq, void *v) > > > > put_task_struct((struct task_struct *)v); > > > > } > > > > > > > [...] > >