On 8/29/22 12:23 PM, Kui-Feng Lee wrote:
Allow creating an iterator that loops through resources of one
thread/process.
People could only create iterators to loop through all resources of
files, vma, and tasks in the system, even though they were interested
in only the resources of a specific task or process. Passing the
additional parameters, people can now create an iterator to go
through all resources or only the resources of a task.
Signed-off-by: Kui-Feng Lee <kuifeng@xxxxxx>
Acked-by: Yonghong Song <yhs@xxxxxx>
---
include/linux/bpf.h | 25 +++++
include/uapi/linux/bpf.h | 6 ++
kernel/bpf/task_iter.c | 184 +++++++++++++++++++++++++++++----
tools/include/uapi/linux/bpf.h | 6 ++
4 files changed, 199 insertions(+), 22 deletions(-)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 9c1674973e03..31ac2c1181f5 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1730,6 +1730,27 @@ int bpf_obj_get_user(const char __user *pathname, int flags);
extern int bpf_iter_ ## target(args); \
int __init bpf_iter_ ## target(args) { return 0; }
+/*
+ * The task type of iterators.
+ *
+ * For BPF task iterators, they can be parameterized with various
+ * parameters to visit only some of tasks.
+ *
+ * BPF_TASK_ITER_ALL (default)
+ * Iterate over resources of every task.
+ *
+ * BPF_TASK_ITER_TID
+ * Iterate over resources of a task/tid.
+ *
+ * BPF_TASK_ITER_TGID
+ * Iterate over resources of every task of a process / task group.
+ */
+enum bpf_iter_task_type {
+ BPF_TASK_ITER_ALL = 0,
+ BPF_TASK_ITER_TID,
+ BPF_TASK_ITER_TGID,
+};
+
struct bpf_iter_aux_info {
/* for map_elem iter */
struct bpf_map *map;
@@ -1739,6 +1760,10 @@ struct bpf_iter_aux_info {
struct cgroup *start; /* starting cgroup */
enum bpf_cgroup_iter_order order;
} cgroup;
+ struct {
+ enum bpf_iter_task_type type;
+ u32 pid;
+ } task;
};
typedef int (*bpf_iter_attach_target_t)(struct bpf_prog *prog,
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 962960a98835..f212a19eda06 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -110,6 +110,12 @@ union bpf_iter_link_info {
__u32 cgroup_fd;
__u64 cgroup_id;
} cgroup;
+ /* Parameters of task iterators. */
+ struct {
+ __u32 tid;
+ __u32 pid;
+ __u32 pid_fd;
+ } task;
};
/* BPF syscall commands, see bpf(2) man-page for more details. */
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index 8c921799def4..0bc7277d1ee1 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -12,6 +12,9 @@
struct bpf_iter_seq_task_common {
struct pid_namespace *ns;
+ enum bpf_iter_task_type type;
+ u32 pid;
+ u32 pid_visiting;
};
struct bpf_iter_seq_task_info {
@@ -22,18 +25,107 @@ struct bpf_iter_seq_task_info {
u32 tid;
};
-static struct task_struct *task_seq_get_next(struct pid_namespace *ns,
+static struct task_struct *task_group_seq_get_next(struct bpf_iter_seq_task_common *common,
+ u32 *tid,
+ bool skip_if_dup_files)
+{
+ struct task_struct *task, *next_task;
+ struct pid *pid;
+ u32 saved_tid;
+
+ if (!*tid) {
Add a comment in the above to say that this is for the *very first*
visit of tasks in the process.
+ pid = find_pid_ns(common->pid, common->ns);
+ if (pid)
+ task = get_pid_task(pid, PIDTYPE_TGID);
'task' is not initialized, so it is possible task could hold a
garbase value here if !pid, right?
Also if indeed task is NULL, here, should we return NULL here
first?
+
+ *tid = common->pid;
+ common->pid_visiting = common->pid;
+
+ return task;
+ }
+
+ /* The callers increase *tid by 1 once they want next task.
+ * However, next_thread() doesn't return tasks in incremental
+ * order of pids. We can not find next task by just finding a
+ * task whose pid is greater or equal to *tid. pid_visiting
+ * remembers the pid value of the task returned last time. By
+ * comparing pid_visiting and *tid, we known if the caller
+ * wants the next task.
+ */
+ if (*tid == common->pid_visiting) {
+ pid = find_pid_ns(common->pid_visiting, common->ns);
+ task = get_pid_task(pid, PIDTYPE_PID);
+
+ return task;
+ }
Do not understand the above code. Why we need it? Looks like
the code below trying to get the *next_task* and will return NULL
if wrap around happens(the tid again equals tgid), right?
+
+retry:
+ pid = find_pid_ns(common->pid_visiting, common->ns);
+ if (!pid)
+ return NULL;
+
+ task = get_pid_task(pid, PIDTYPE_PID);
+ if (!task)
+ return NULL;
+
+ next_task = next_thread(task);
+ put_task_struct(task);
+ if (!next_task)
+ return NULL;
+
+ saved_tid = *tid;
+ *tid = __task_pid_nr_ns(next_task, PIDTYPE_PID, common->ns);
+ if (*tid == common->pid) {
+ /* Run out of tasks of a process. The tasks of a
+ * thread_group are linked as circular linked list.
+ */
+ *tid = saved_tid;
+ return NULL;
+ }
+
+ get_task_struct(next_task);
+ common->pid_visiting = *tid;
We could do quite some redundant works here if the following
condition is true. Basically, we get next_task and get a tid
and release it, but in the next iteration, from tid, we try to get
the task again.
+
+ if (skip_if_dup_files && task->files == task->group_leader->files)
+ goto retry;
+
+ return next_task;
+}
+
+static struct task_struct *task_seq_get_next(struct bpf_iter_seq_task_common *common,
u32 *tid,
bool skip_if_dup_files)
{
struct task_struct *task = NULL;
struct pid *pid;
+ if (common->type == BPF_TASK_ITER_TID) {
+ if (*tid && *tid != common->pid)
+ return NULL;
+ rcu_read_lock();
+ pid = find_pid_ns(common->pid, common->ns);
+ if (pid) {
+ task = get_pid_task(pid, PIDTYPE_TGID);
+ *tid = common->pid;
+ }
+ rcu_read_unlock();
+
+ return task;
+ }
+
+ if (common->type == BPF_TASK_ITER_TGID) {
+ rcu_read_lock();
+ task = task_group_seq_get_next(common, tid, skip_if_dup_files);
+ rcu_read_unlock();
+
+ return task;
+ }
+
rcu_read_lock();
retry:
- pid = find_ge_pid(*tid, ns);
+ pid = find_ge_pid(*tid, common->ns);
if (pid) {
- *tid = pid_nr_ns(pid, ns);
+ *tid = pid_nr_ns(pid, common->ns);
task = get_pid_task(pid, PIDTYPE_PID);
if (!task) {
++*tid;
@@ -56,7 +148,7 @@ static void *task_seq_start(struct seq_file *seq, loff_t *pos)
struct bpf_iter_seq_task_info *info = seq->private;
struct task_struct *task;
- task = task_seq_get_next(info->common.ns, &info->tid, false);
+ task = task_seq_get_next(&info->common, &info->tid, false);
if (!task)
return NULL;
@@ -73,7 +165,7 @@ static void *task_seq_next(struct seq_file *seq, void *v, loff_t *pos)
++*pos;
++info->tid;
put_task_struct((struct task_struct *)v);
- task = task_seq_get_next(info->common.ns, &info->tid, false);
+ task = task_seq_get_next(&info->common, &info->tid, false);
if (!task)
return NULL;
@@ -117,6 +209,45 @@ static void task_seq_stop(struct seq_file *seq, void *v)
put_task_struct((struct task_struct *)v);
}
[...]