Re: [PATCH bpf-next v5 1/3] bpf: Parameterize task iterators.

Kui-Feng Lee <kuifeng@xxxxxx> · Tue, 16 Aug 2022 19:11:17 +0000

On Mon, 2022-08-15 at 16:08 -0700, Hao Luo wrote:
> !-------------------------------------------------------------------|
>   This Message Is From an External Sender
> 
> > -------------------------------------------------------------------
> > !
> 
> Hi Kui-Feng,
> 
> On Wed, Aug 10, 2022 at 5:17 PM Kui-Feng Lee <kuifeng@xxxxxx> wrote:
> > 
> > Allow creating an iterator that loops through resources of one
> > task/thread.
> > 
> > People could only create iterators to loop through all resources of
> > files, vma, and tasks in the system, even though they were
> > interested
> > in only the resources of a specific task or process.  Passing the
> > additional parameters, people can now create an iterator to go
> > through all resources or only the resources of a task.
> > 
> > Signed-off-by: Kui-Feng Lee <kuifeng@xxxxxx>
> > ---
> >  include/linux/bpf.h            |  29 ++++++++
> >  include/uapi/linux/bpf.h       |   8 +++
> >  kernel/bpf/task_iter.c         | 126 ++++++++++++++++++++++++++---
> > ----
> >  tools/include/uapi/linux/bpf.h |   8 +++
> >  4 files changed, 147 insertions(+), 24 deletions(-)
> > 
> > diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> > index 11950029284f..6bbe53d06faa 100644
> > --- a/include/linux/bpf.h
> > +++ b/include/linux/bpf.h
> > @@ -1716,8 +1716,37 @@ int bpf_obj_get_user(const char __user
> > *pathname, int flags);
> >         extern int bpf_iter_ ## target(args);                   \
> >         int __init bpf_iter_ ## target(args) { return 0; }
> > 
> > +/*
> > + * The task type of iterators.
> > + *
> > + * For BPF task iterators, they can be parameterized with various
> > + * parameters to visit only some of tasks.
> > + *
> > + * BPF_TASK_ITER_ALL (default)
> > + *     Iterate over resources of every task.
> > + *
> > + * BPF_TASK_ITER_TID
> > + *     Iterate over resources of a task/tid.
> > + *
> > + * BPF_TASK_ITER_TGID
> > + *     Iterate over reosurces of evevry task of a process / task
> > group.
> 
> typos: resources and every.
> 
> > + */
> > +enum bpf_iter_task_type {
> > +       BPF_TASK_ITER_ALL = 0,
> > +       BPF_TASK_ITER_TID,
> > +       BPF_TASK_ITER_TGID,
> > +};
> > +
> >  struct bpf_iter_aux_info {
> >         struct bpf_map *map;
> > +       struct {
> > +               enum bpf_iter_task_type type;
> > +               union {
> > +                       u32 tid;
> > +                       u32 tgid;
> > +                       u32 pid_fd;
> > +               };
> > +       } task;
> >  };
> > 
> >  typedef int (*bpf_iter_attach_target_t)(struct bpf_prog *prog,
> > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > index ffcbf79a556b..6328aca0cf5c 100644
> > --- a/include/uapi/linux/bpf.h
> > +++ b/include/uapi/linux/bpf.h
> > @@ -91,6 +91,14 @@ union bpf_iter_link_info {
> >         struct {
> >                 __u32   map_fd;
> >         } map;
> > +       /*
> > +        * Parameters of task iterators.
> > +        */
> 
> We could remove this particular comment. It is kind of obvious.
> 
> > +       struct {
> > +               __u32   tid;
> > +               __u32   tgid;
> > +               __u32   pid_fd;
> > +       } task;
> >  };
> > 
> >  /* BPF syscall commands, see bpf(2) man-page for more details. */
> > diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
> > index 8c921799def4..f2e21efe075d 100644
> > --- a/kernel/bpf/task_iter.c
> > +++ b/kernel/bpf/task_iter.c
> > @@ -12,6 +12,12 @@
> > 
> >  struct bpf_iter_seq_task_common {
> >         struct pid_namespace *ns;
> > +       enum bpf_iter_task_type type;
> > +       union {
> > +               u32 tid;
> > +               u32 tgid;
> > +               u32 pid_fd;
> > +       };
> >  };
> > 
> >  struct bpf_iter_seq_task_info {
> > @@ -22,24 +28,40 @@ struct bpf_iter_seq_task_info {
> >         u32 tid;
> >  };
> > 
> > -static struct task_struct *task_seq_get_next(struct pid_namespace
> > *ns,
> > +static struct task_struct *task_seq_get_next(struct
> > bpf_iter_seq_task_common *common,
> >                                              u32 *tid,
> >                                              bool
> > skip_if_dup_files)
> >  {
> >         struct task_struct *task = NULL;
> >         struct pid *pid;
> > 
> > +       if (common->type == BPF_TASK_ITER_TID) {
> > +               if (*tid && *tid != common->tid)
> > +                       return NULL;
> > +               rcu_read_lock();
> > +               pid = find_pid_ns(common->tid, common->ns);
> > +               if (pid) {
> > +                       task = get_pid_task(pid, PIDTYPE_PID);
> > +                       *tid = common->tid;
> > +               }
> > +               rcu_read_unlock();
> 
> nit: this is ok. But I think the commonly used pattern (e.g.
> proc_pid_lookup) is
> 
>         rcu_read_lock();
>         task = find_task_by_pid_ns(tid, ns);
>         if (task)
>                 get_task_struct(task);
>         rcu_read_unlock();
> 
> > +               return task;
> > +       }
> > +
> >         rcu_read_lock();
> >  retry:
> > -       pid = find_ge_pid(*tid, ns);
> > +       pid = find_ge_pid(*tid, common->ns);
> >         if (pid) {
> > -               *tid = pid_nr_ns(pid, ns);
> > +               *tid = pid_nr_ns(pid, common->ns);
> >                 task = get_pid_task(pid, PIDTYPE_PID);
> > +
> >                 if (!task) {
> >                         ++*tid;
> >                         goto retry;
> > -               } else if (skip_if_dup_files &&
> > !thread_group_leader(task) &&
> > -                          task->files == task->group_leader-
> > >files) {
> > +               } else if ((skip_if_dup_files &&
> > !thread_group_leader(task) &&
> > +                           task->files == task->group_leader-
> > >files) ||
> > +                          (common->type == BPF_TASK_ITER_TGID &&
> > +                           __task_pid_nr_ns(task, PIDTYPE_TGID,
> > common->ns) != common->tgid)) {
> 
> Use task_tgid_nr_ns instead of __task_pid_nr_ns?
> 
> >                         put_task_struct(task);
> >                         task = NULL;
> >                         ++*tid;
> > @@ -56,7 +78,8 @@ static void *task_seq_start(struct seq_file *seq,
> > loff_t *pos)
> >         struct bpf_iter_seq_task_info *info = seq->private;
> >         struct task_struct *task;
> > 
> > -       task = task_seq_get_next(info->common.ns, &info->tid,
> > false);
> > +       task = task_seq_get_next(&info->common, &info->tid, false);
> > +
> >         if (!task)
> >                 return NULL;
> > 
> > @@ -73,7 +96,8 @@ static void *task_seq_next(struct seq_file *seq,
> > void *v, loff_t *pos)
> >         ++*pos;
> >         ++info->tid;
> >         put_task_struct((struct task_struct *)v);
> > -       task = task_seq_get_next(info->common.ns, &info->tid,
> > false);
> > +
> > +       task = task_seq_get_next(&info->common, &info->tid, false);
> >         if (!task)
> >                 return NULL;
> > 
> > @@ -117,6 +141,43 @@ static void task_seq_stop(struct seq_file
> > *seq, void *v)
> >                 put_task_struct((struct task_struct *)v);
> >  }
> > 
> > +static int bpf_iter_attach_task(struct bpf_prog *prog,
> > +                               union bpf_iter_link_info *linfo,
> > +                               struct bpf_iter_aux_info *aux)
> > +{
> > +       unsigned int flags;
> > +       struct pid_namespace *ns;
> > +       struct pid *pid;
> > +       pid_t tgid;
> > +
> > +       if (linfo->task.tid != 0) {
> > +               aux->task.type = BPF_TASK_ITER_TID;
> > +               aux->task.tid = linfo->task.tid;
> > +       } else if (linfo->task.tgid != 0) {
> > +               aux->task.type = BPF_TASK_ITER_TGID;
> > +               aux->task.tgid = linfo->task.tgid;
> > +       } else if (linfo->task.pid_fd != 0) {
> > +               aux->task.type = BPF_TASK_ITER_TGID;
> > +               pid = pidfd_get_pid(linfo->task.pid_fd, &flags);
> > +               if (IS_ERR(pid))
> > +                       return PTR_ERR(pid);
> > +
> > +               ns = task_active_pid_ns(current);
> > +               if (IS_ERR(ns))
> > +                       return PTR_ERR(ns);
> > +
> > +               tgid = pid_nr_ns(pid, ns);
> > +               if (tgid <= 0)
> > +                       return -EINVAL;
> 
> Is this just pid_vnr?
> 
> > +
> > +               aux->task.tgid = tgid;
> > +       } else {
> > +               aux->task.type = BPF_TASK_ITER_ALL;
> > +       }
> 
> The same question as Yonghong has. Do we need to enforce that at most
> one of {tid, tgid, pid_fd} is non-zero?
> 
> > +
> > +       return 0;
> > +}
> > +
> >  static const struct seq_operations task_seq_ops = {
> >         .start  = task_seq_start,
> >         .next   = task_seq_next,
> > @@ -137,8 +198,7 @@ struct bpf_iter_seq_task_file_info {
> >  static struct file *
> >  task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info)
> >  {
> > -       struct pid_namespace *ns = info->common.ns;
> > -       u32 curr_tid = info->tid;
> > +       u32 saved_tid = info->tid;
> >         struct task_struct *curr_task;
> >         unsigned int curr_fd = info->fd;
> > 
> > @@ -151,21 +211,18 @@ task_file_seq_get_next(struct
> > bpf_iter_seq_task_file_info *info)
> >                 curr_task = info->task;
> >                 curr_fd = info->fd;
> >         } else {
> > -                curr_task = task_seq_get_next(ns, &curr_tid,
> > true);
> > +               curr_task = task_seq_get_next(&info->common, &info-
> > >tid, true);
> >                  if (!curr_task) {
> >                          info->task = NULL;
> > -                        info->tid = curr_tid;
> >                          return NULL;
> >                  }
> > 
> > -                /* set info->task and info->tid */
> > +               /* set info->task */
> >                 info->task = curr_task;
> > -               if (curr_tid == info->tid) {
> > +               if (saved_tid == info->tid)
> >                         curr_fd = info->fd;
> > -               } else {
> > -                       info->tid = curr_tid;
> > +               else
> >                         curr_fd = 0;
> > -               }
> >         }
> > 
> >         rcu_read_lock();
> > @@ -186,9 +243,15 @@ task_file_seq_get_next(struct
> > bpf_iter_seq_task_file_info *info)
> >         /* the current task is done, go to the next task */
> >         rcu_read_unlock();
> >         put_task_struct(curr_task);
> > +
> > +       if (info->common.type == BPF_TASK_ITER_TID) {
> > +               info->task = NULL;
> > +               return NULL;
> > +       }
> > +
> 
> Do we need to set info->fd to 0? I am not sure if the caller reads
> info->fd anywhere. I think it would be good to do some refactoring on
> task_file_seq_get_next().
> 
> >         info->task = NULL;
> >         info->fd = 0;
> > -       curr_tid = ++(info->tid);
> > +       saved_tid = ++(info->tid);
> >         goto again;
> >  }
> > 
> > @@ -269,6 +332,17 @@ static int init_seq_pidns(void *priv_data,
> > struct bpf_iter_aux_info *aux)
> >         struct bpf_iter_seq_task_common *common = priv_data;
> > 
> >         common->ns = get_pid_ns(task_active_pid_ns(current));
> > +       common->type = aux->task.type;
> > +       switch (common->type) {
> > +       case BPF_TASK_ITER_TID:
> > +               common->tid = aux->task.tid;
> > +               break;
> > +       case BPF_TASK_ITER_TGID:
> > +               common->tgid = aux->task.tgid;
> > +               break;
> > +       default:
> 
> very nit: IMHO we could place a warning here.
> 
> > +               break;
> > +       }
> >         return 0;
> >  }
> > 
> > @@ -307,11 +381,10 @@ enum bpf_task_vma_iter_find_op {
> >  static struct vm_area_struct *
> >  task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info)
> >  {
> > -       struct pid_namespace *ns = info->common.ns;
> >         enum bpf_task_vma_iter_find_op op;
> >         struct vm_area_struct *curr_vma;
> >         struct task_struct *curr_task;
> > -       u32 curr_tid = info->tid;
> > +       u32 saved_tid = info->tid;
> > 
> 
> Why do we need to directly operate on info->tid while other task
> iters
> (e.g. vma_iter) uses curr_tid? IMHO, prefer staying using curr_tid if
> possible, for two reasons:
>  - consistent with other iters.
>  - decouple refactoring changes from the changes that introduce new
> features

These are reasonable considerations.  I changed it to make it easier to
maintain the value of info->tid.  With curr_tid, we always have to put
curr_tid's value back into info->tid before leaving the function, which
is obfuscating and error-prone, IMHO.

I may add a separate patch for the refactoring.

> 
> >         /* If this function returns a non-NULL vma, it holds a
> > reference to
> >          * the task_struct, and holds read lock on vma->mm-
> > >mmap_lock.
> > @@ -371,14 +444,13 @@ task_vma_seq_get_next(struct
> > bpf_iter_seq_task_vma_info *info)
> >                 }
> >         } else {
> >  again:
> > -               curr_task = task_seq_get_next(ns, &curr_tid, true);
> > +               curr_task = task_seq_get_next(&info->common, &info-
> > >tid, true);
> >                 if (!curr_task) {
> > -                       info->tid = curr_tid + 1;
> > +                       info->tid++;
> >                         goto finish;
> >                 }
> > 
> > -               if (curr_tid != info->tid) {
> > -                       info->tid = curr_tid;
> > +               if (saved_tid != info->tid) {
> >                         /* new task, process the first vma */
> >                         op = task_vma_iter_first_vma;
> >                 } else {
> > @@ -430,9 +502,12 @@ task_vma_seq_get_next(struct
> > bpf_iter_seq_task_vma_info *info)
> >         return curr_vma;
> > 
> >  next_task:
> > +       if (info->common.type == BPF_TASK_ITER_TID)
> > +               goto finish;
> > +
> >         put_task_struct(curr_task);
> >         info->task = NULL;
> > -       curr_tid++;
> > +       info->tid++;
> >         goto again;
> > 
> >  finish:
> > @@ -533,6 +608,7 @@ static const struct bpf_iter_seq_info
> > task_seq_info = {
> > 
> >  static struct bpf_iter_reg task_reg_info = {
> >         .target                 = "task",
> > +       .attach_target          = bpf_iter_attach_task,
> >         .feature                = BPF_ITER_RESCHED,
> >         .ctx_arg_info_size      = 1,
> >         .ctx_arg_info           = {
> > @@ -551,6 +627,7 @@ static const struct bpf_iter_seq_info
> > task_file_seq_info = {
> > 
> >  static struct bpf_iter_reg task_file_reg_info = {
> >         .target                 = "task_file",
> > +       .attach_target          = bpf_iter_attach_task,
> >         .feature                = BPF_ITER_RESCHED,
> >         .ctx_arg_info_size      = 2,
> >         .ctx_arg_info           = {
> > @@ -571,6 +648,7 @@ static const struct bpf_iter_seq_info
> > task_vma_seq_info = {
> > 
> >  static struct bpf_iter_reg task_vma_reg_info = {
> >         .target                 = "task_vma",
> > +       .attach_target          = bpf_iter_attach_task,
> >         .feature                = BPF_ITER_RESCHED,
> >         .ctx_arg_info_size      = 2,
> >         .ctx_arg_info           = {
> > diff --git a/tools/include/uapi/linux/bpf.h
> > b/tools/include/uapi/linux/bpf.h
> > index ffcbf79a556b..6328aca0cf5c 100644
> > --- a/tools/include/uapi/linux/bpf.h
> > +++ b/tools/include/uapi/linux/bpf.h
> > @@ -91,6 +91,14 @@ union bpf_iter_link_info {
> >         struct {
> >                 __u32   map_fd;
> >         } map;
> > +       /*
> > +        * Parameters of task iterators.
> > +        */
> > +       struct {
> > +               __u32   tid;
> > +               __u32   tgid;
> > +               __u32   pid_fd;
> > +       } task;
> >  };
> > 
> >  /* BPF syscall commands, see bpf(2) man-page for more details. */
> > --
> > 2.30.2
> >