Re: [PATCH bpf-next v5 1/3] bpf: Parameterize task iterators.

Kui-Feng Lee <kuifeng@xxxxxx> · Tue, 16 Aug 2022 17:00:36 +0000

On Sat, 2022-08-13 at 15:17 -0700, Yonghong Song wrote:
> 
> 
> On 8/10/22 5:16 PM, Kui-Feng Lee wrote:
> > Allow creating an iterator that loops through resources of one
> > task/thread.
> > 
> > People could only create iterators to loop through all resources of
> > files, vma, and tasks in the system, even though they were
> > interested
> > in only the resources of a specific task or process.  Passing the
> > additional parameters, people can now create an iterator to go
> > through all resources or only the resources of a task.
> > 
> > Signed-off-by: Kui-Feng Lee <kuifeng@xxxxxx>
> > ---
> >   include/linux/bpf.h            |  29 ++++++++
> >   include/uapi/linux/bpf.h       |   8 +++
> >   kernel/bpf/task_iter.c         | 126 ++++++++++++++++++++++++++--
> > -----
> >   tools/include/uapi/linux/bpf.h |   8 +++
> >   4 files changed, 147 insertions(+), 24 deletions(-)
> > 
> > diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> > index 11950029284f..6bbe53d06faa 100644
> > --- a/include/linux/bpf.h
> > +++ b/include/linux/bpf.h
> > @@ -1716,8 +1716,37 @@ int bpf_obj_get_user(const char __user
> > *pathname, int flags);
> >         extern int bpf_iter_ ## target(args);                   \
> >         int __init bpf_iter_ ## target(args) { return 0; }
> >   
> > +/*
> > + * The task type of iterators.
> > + *
> > + * For BPF task iterators, they can be parameterized with various
> > + * parameters to visit only some of tasks.
> > + *
> > + * BPF_TASK_ITER_ALL (default)
> > + *     Iterate over resources of every task.
> > + *
> > + * BPF_TASK_ITER_TID
> > + *     Iterate over resources of a task/tid.
> > + *
> > + * BPF_TASK_ITER_TGID
> > + *     Iterate over resources of every task of a process / task
> > group.
> > + */
> > +enum bpf_iter_task_type {
> > +       BPF_TASK_ITER_ALL = 0,
> > +       BPF_TASK_ITER_TID,
> > +       BPF_TASK_ITER_TGID,
> > +};
> > +
> >   struct bpf_iter_aux_info {
> >         struct bpf_map *map;
> > +       struct {
> > +               enum bpf_iter_task_type type;
> > +               union {
> > +                       u32 tid;
> > +                       u32 tgid;
> > +                       u32 pid_fd;
> > +               };
> > +       } task;
> >   };
> >   
> >   typedef int (*bpf_iter_attach_target_t)(struct bpf_prog *prog,
> > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > index ffcbf79a556b..6328aca0cf5c 100644
> > --- a/include/uapi/linux/bpf.h
> > +++ b/include/uapi/linux/bpf.h
> > @@ -91,6 +91,14 @@ union bpf_iter_link_info {
> >         struct {
> >                 __u32   map_fd;
> >         } map;
> > +       /*
> > +        * Parameters of task iterators.
> > +        */
> 
> The comment can be put into one line.
> 
> > +       struct {
> > +               __u32   tid;
> > +               __u32   tgid;
> > +               __u32   pid_fd;
> 
> The above is a mix of kernel and user space terminologies.
> tid/pid are user space concepts and tgid is a kernel space
> concept.
> 
> In bpf uapi header, we have
> 
> struct bpf_pidns_info {
>          __u32 pid;
>          __u32 tgid;
> };
> 
> which uses kernel terminologies.
> 
> So I suggest the bpf_iter_link_info.task can also
> use pure kernel terminology pid/tgid/tgid_fd here.
> 
> Alternatively, using pure user space terminology
> can be tid/pid/pid_fd but it seems the kernel terminology
> might be better since we already have a precedent.
> 
> 
> > +       } task;
> >   };
> >   
> >   /* BPF syscall commands, see bpf(2) man-page for more details. */
> > diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
> > index 8c921799def4..f2e21efe075d 100644
> > --- a/kernel/bpf/task_iter.c
> > +++ b/kernel/bpf/task_iter.c
> > @@ -12,6 +12,12 @@
> >   
> >   struct bpf_iter_seq_task_common {
> >         struct pid_namespace *ns;
> > +       enum bpf_iter_task_type type;
> > +       union {
> > +               u32 tid;
> > +               u32 tgid;
> > +               u32 pid_fd;
> > +       };
> >   };
> >   
> >   struct bpf_iter_seq_task_info {
> > @@ -22,24 +28,40 @@ struct bpf_iter_seq_task_info {
> >         u32 tid;
> >   };
> >   
> > -static struct task_struct *task_seq_get_next(struct pid_namespace
> > *ns,
> > +static struct task_struct *task_seq_get_next(struct
> > bpf_iter_seq_task_common *common,
> >                                              u32 *tid,
> >                                              bool
> > skip_if_dup_files)
> >   {
> >         struct task_struct *task = NULL;
> >         struct pid *pid;
> >   
> > +       if (common->type == BPF_TASK_ITER_TID) {
> > +               if (*tid && *tid != common->tid)
> > +                       return NULL;
> > +               rcu_read_lock();
> > +               pid = find_pid_ns(common->tid, common->ns);
> > +               if (pid) {
> > +                       task = get_pid_task(pid, PIDTYPE_PID);
> > +                       *tid = common->tid;
> > +               }
> > +               rcu_read_unlock();
> > +               return task;
> > +       }
> > +
> >         rcu_read_lock();
> >   retry:
> > -       pid = find_ge_pid(*tid, ns);
> > +       pid = find_ge_pid(*tid, common->ns);
> >         if (pid) {
> > -               *tid = pid_nr_ns(pid, ns);
> > +               *tid = pid_nr_ns(pid, common->ns);
> >                 task = get_pid_task(pid, PIDTYPE_PID);
> > +
> 
> This extra line is unnecessary.
> 
> >                 if (!task) {
> >                         ++*tid;
> >                         goto retry;
> > -               } else if (skip_if_dup_files &&
> > !thread_group_leader(task) &&
> > -                          task->files == task->group_leader-
> > >files) {
> > +               } else if ((skip_if_dup_files &&
> > !thread_group_leader(task) &&
> > +                           task->files == task->group_leader-
> > >files) ||
> > +                          (common->type == BPF_TASK_ITER_TGID &&
> > +                           __task_pid_nr_ns(task, PIDTYPE_TGID,
> > common->ns) != common->tgid)) {
> >                         put_task_struct(task);
> >                         task = NULL;
> >                         ++*tid;
> > @@ -56,7 +78,8 @@ static void *task_seq_start(struct seq_file *seq,
> > loff_t *pos)
> >         struct bpf_iter_seq_task_info *info = seq->private;
> >         struct task_struct *task;
> >   
> > -       task = task_seq_get_next(info->common.ns, &info->tid,
> > false);
> > +       task = task_seq_get_next(&info->common, &info->tid, false);
> > +
> 
> Extra line?
> 
> >         if (!task)
> >                 return NULL;
> >   
> > @@ -73,7 +96,8 @@ static void *task_seq_next(struct seq_file *seq,
> > void *v, loff_t *pos)
> >         ++*pos;
> >         ++info->tid;
> >         put_task_struct((struct task_struct *)v);
> > -       task = task_seq_get_next(info->common.ns, &info->tid,
> > false);
> > +
> 
> Extra line?
> 
> > +       task = task_seq_get_next(&info->common, &info->tid, false);
> >         if (!task)
> >                 return NULL;
> >   
> > @@ -117,6 +141,43 @@ static void task_seq_stop(struct seq_file
> > *seq, void *v)
> >                 put_task_struct((struct task_struct *)v);
> >   }
> >   
> > +static int bpf_iter_attach_task(struct bpf_prog *prog,
> > +                               union bpf_iter_link_info *linfo,
> > +                               struct bpf_iter_aux_info *aux)
> > +{
> > +       unsigned int flags;
> > +       struct pid_namespace *ns;
> > +       struct pid *pid;
> > +       pid_t tgid;
> 
> Follow reverse Christmas tree style?
> 
> > +
> > +       if (linfo->task.tid != 0) {
> > +               aux->task.type = BPF_TASK_ITER_TID;
> > +               aux->task.tid = linfo->task.tid;
> > +       } else if (linfo->task.tgid != 0) {
> > +               aux->task.type = BPF_TASK_ITER_TGID;
> > +               aux->task.tgid = linfo->task.tgid;
> > +       } else if (linfo->task.pid_fd != 0) {
> > +               aux->task.type = BPF_TASK_ITER_TGID;
> > +               pid = pidfd_get_pid(linfo->task.pid_fd, &flags);
> > +               if (IS_ERR(pid))
> > +                       return PTR_ERR(pid);
> > +
> > +               ns = task_active_pid_ns(current);
> > +               if (IS_ERR(ns))
> > +                       return PTR_ERR(ns);
> > +
> > +               tgid = pid_nr_ns(pid, ns);
> > +               if (tgid <= 0)
> > +                       return -EINVAL;
> 
> Is it possible that tgid <= 0? I think no, so
> the above two lines are unnecessary.
> 
> > +
> > +               aux->task.tgid = tgid;
> 
> We leak the reference count for 'pid' here.
> We need to add
>                 put_pid(pid);
> to release the reference for pid.
>         
> > +       } else {
> > +               aux->task.type = BPF_TASK_ITER_ALL;
> > +       }
> 
> What will happen if two or all of task.tid, task.tgid and
> task.pid_fd are non-zero? Should we fail here?
> 
> > +
> > +       return 0;
> > +}
> > +
> >   static const struct seq_operations task_seq_ops = {
> >         .start  = task_seq_start,
> >         .next   = task_seq_next,
> > @@ -137,8 +198,7 @@ struct bpf_iter_seq_task_file_info {
> >   static struct file *
> [...]
> >   
> > @@ -307,11 +381,10 @@ enum bpf_task_vma_iter_find_op {
> >   static struct vm_area_struct *
> >   task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info)
> >   {
> > -       struct pid_namespace *ns = info->common.ns;
> >         enum bpf_task_vma_iter_find_op op;
> >         struct vm_area_struct *curr_vma;
> >         struct task_struct *curr_task;
> > -       u32 curr_tid = info->tid;
> > +       u32 saved_tid = info->tid;
> >   
> >         /* If this function returns a non-NULL vma, it holds a
> > reference to
> >          * the task_struct, and holds read lock on vma->mm-
> > >mmap_lock.
> > @@ -371,14 +444,13 @@ task_vma_seq_get_next(struct
> > bpf_iter_seq_task_vma_info *info)
> >                 }
> >         } else {
> >   again:
> > -               curr_task = task_seq_get_next(ns, &curr_tid, true);
> > +               curr_task = task_seq_get_next(&info->common, &info-
> > >tid, true);
> >                 if (!curr_task) {
> > -                       info->tid = curr_tid + 1;
> > +                       info->tid++;
> >                         goto finish;
> >                 }
> >   
> > -               if (curr_tid != info->tid) {
> > -                       info->tid = curr_tid;
> > +               if (saved_tid != info->tid) {
> >                         /* new task, process the first vma */
> >                         op = task_vma_iter_first_vma;
> >                 } else {
> > @@ -430,9 +502,12 @@ task_vma_seq_get_next(struct
> > bpf_iter_seq_task_vma_info *info)
> >         return curr_vma;
> >   
> >   next_task:
> > +       if (info->common.type == BPF_TASK_ITER_TID)
> > +               goto finish;
> > +
> >         put_task_struct(curr_task);
> >         info->task = NULL;
> > -       curr_tid++;
> > +       info->tid++;
> 
> saved_tid = ++info->tid?

saved_tid is the value of info->tid on entry to this function.
It is used to check whether the task currently being visited is the
same one as when this function was entered.  For this purpose, updating
saved_tid or not will not change the result.  The value of info->tid
will differ from saved_tid after info->tid++ anyway, which shows that
the task currently being visited is not the one seen on entry.

> 
> >         goto again;
> >   
> >   finish:
> [...]