> On Nov 1, 2021, at 3:23 PM, Daniel Borkmann <daniel@xxxxxxxxxxxxx> wrote: > > On 10/28/21 12:00 AM, Song Liu wrote: > [...] >> /* integer value in 'imm' field of BPF_CALL instruction selects which helper >> diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c >> index b48750bfba5aa..ad30f2e885356 100644 >> --- a/kernel/bpf/task_iter.c >> +++ b/kernel/bpf/task_iter.c >> @@ -8,6 +8,7 @@ >> #include <linux/fdtable.h> >> #include <linux/filter.h> >> #include <linux/btf_ids.h> >> +#include <linux/irq_work.h> >> struct bpf_iter_seq_task_common { >> struct pid_namespace *ns; >> @@ -21,6 +22,25 @@ struct bpf_iter_seq_task_info { >> u32 tid; >> }; >> +/* irq_work to run mmap_read_unlock() */ >> +struct task_iter_irq_work { >> + struct irq_work irq_work; >> + struct mm_struct *mm; >> +}; >> + >> +static DEFINE_PER_CPU(struct task_iter_irq_work, mmap_unlock_work); >> + >> +static void do_mmap_read_unlock(struct irq_work *entry) >> +{ >> + struct task_iter_irq_work *work; >> + >> + if (WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_RT))) >> + return; >> + >> + work = container_of(entry, struct task_iter_irq_work, irq_work); >> + mmap_read_unlock_non_owner(work->mm); >> +} >> + >> static struct task_struct *task_seq_get_next(struct pid_namespace *ns, >> u32 *tid, >> bool skip_if_dup_files) >> @@ -586,9 +606,89 @@ static struct bpf_iter_reg task_vma_reg_info = { >> .seq_info = &task_vma_seq_info, >> }; >> +BPF_CALL_5(bpf_find_vma, struct task_struct *, task, u64, start, >> + bpf_callback_t, callback_fn, void *, callback_ctx, u64, flags) >> +{ >> + struct task_iter_irq_work *work = NULL; >> + struct mm_struct *mm = task->mm; > > Won't this NULL deref if called with task argument as NULL? Will fix. > >> + struct vm_area_struct *vma; >> + bool irq_work_busy = false; >> + int ret = -ENOENT; >> + >> + if (flags) >> + return -EINVAL; >> + >> + if (!mm) >> + return -ENOENT; >> + >> + /* >> + * Similar to stackmap with build_id support, we cannot simply do >> + * mmap_read_unlock when the irq is disabled. Instead, we need do >> + * the unlock in the irq_work. >> + */ >> + if (irqs_disabled()) { >> + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) { >> + work = this_cpu_ptr(&mmap_unlock_work); >> + if (irq_work_is_busy(&work->irq_work)) { >> + /* cannot queue more mmap_unlock, abort. */ >> + irq_work_busy = true; >> + } >> + } else { >> + /* >> + * PREEMPT_RT does not allow to trylock mmap sem in >> + * interrupt disabled context, abort. >> + */ >> + irq_work_busy = true; >> + } >> + } >> + >> + if (irq_work_busy || !mmap_read_trylock(mm)) >> + return -EBUSY; >> + >> + vma = find_vma(mm, start); >> + >> + if (vma && vma->vm_start <= start && vma->vm_end > start) { >> + callback_fn((u64)(long)task, (u64)(long)vma, >> + (u64)(long)callback_ctx, 0, 0); >> + ret = 0; >> + } >> + if (!work) { >> + mmap_read_unlock(current->mm); >> + } else { >> + work->mm = current->mm; >> + >> + /* The lock will be released once we're out of interrupt >> + * context. Tell lockdep that we've released it now so >> + * it doesn't complain that we forgot to release it. >> + */ >> + rwsem_release(¤t->mm->mmap_lock.dep_map, _RET_IP_); >> + irq_work_queue(&work->irq_work); >> + } > > Given this is pretty much the same logic around the vma retrieval, could this be > refactored/consolidated with stack map build id retrieval into a common function? I thought about sharing the irq_work code amount the two. The problem was we need to include irq_work.h in bpf.h. But on a second thought, maybe we should just move bpf_find_vma to stackmap.c? This will avoid including irq_work.h. I guess it is not too weird to have bpf_find_vma in stackmap.c. > >> + return ret; >> +} >> + >> +BTF_ID_LIST_SINGLE(btf_find_vma_ids, struct, task_struct) >> + >> +const struct bpf_func_proto bpf_find_vma_proto = { >> + .func = bpf_find_vma, >> + .ret_type = RET_INTEGER, >> + .arg1_type = ARG_PTR_TO_BTF_ID, >> + .arg1_btf_id = &btf_find_vma_ids[0], >> + .arg2_type = ARG_ANYTHING, >> + .arg3_type = ARG_PTR_TO_FUNC, >> + .arg4_type = ARG_PTR_TO_STACK_OR_NULL, >> + .arg5_type = ARG_ANYTHING, >> +}; > [...] >> diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h >> index c108200378834..056c00da1b5d6 100644 >> --- a/tools/include/uapi/linux/bpf.h >> +++ b/tools/include/uapi/linux/bpf.h >> @@ -4915,6 +4915,24 @@ union bpf_attr { >> * Dynamically cast a *sk* pointer to a *unix_sock* pointer. >> * Return >> * *sk* if casting is valid, or **NULL** otherwise. >> + * long bpf_find_vma(struct task_struct *task, u64 addr, void *callback_fn, void *callback_ctx, u64 flags) > > nit: Wrongly copied uapi header over to tooling? Right... You get really good eyes. :-) [...]