> On Nov 3, 2021, at 4:54 PM, Song Liu <songliubraving@xxxxxx> wrote: > > > >> On Nov 1, 2021, at 3:23 PM, Daniel Borkmann <daniel@xxxxxxxxxxxxx> wrote: >> >> On 10/28/21 12:00 AM, Song Liu wrote: >> [...] >>> /* integer value in 'imm' field of BPF_CALL instruction selects which helper >>> diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c >>> index b48750bfba5aa..ad30f2e885356 100644 >>> --- a/kernel/bpf/task_iter.c >>> +++ b/kernel/bpf/task_iter.c >>> @@ -8,6 +8,7 @@ >>> #include <linux/fdtable.h> >>> #include <linux/filter.h> >>> #include <linux/btf_ids.h> >>> +#include <linux/irq_work.h> >>> struct bpf_iter_seq_task_common { >>> struct pid_namespace *ns; >>> @@ -21,6 +22,25 @@ struct bpf_iter_seq_task_info { >>> u32 tid; >>> }; >>> +/* irq_work to run mmap_read_unlock() */ >>> +struct task_iter_irq_work { >>> + struct irq_work irq_work; >>> + struct mm_struct *mm; >>> +}; >>> + >>> +static DEFINE_PER_CPU(struct task_iter_irq_work, mmap_unlock_work); >>> + >>> +static void do_mmap_read_unlock(struct irq_work *entry) >>> +{ >>> + struct task_iter_irq_work *work; >>> + >>> + if (WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_RT))) >>> + return; >>> + >>> + work = container_of(entry, struct task_iter_irq_work, irq_work); >>> + mmap_read_unlock_non_owner(work->mm); >>> +} >>> + >>> static struct task_struct *task_seq_get_next(struct pid_namespace *ns, >>> u32 *tid, >>> bool skip_if_dup_files) >>> @@ -586,9 +606,89 @@ static struct bpf_iter_reg task_vma_reg_info = { >>> .seq_info = &task_vma_seq_info, >>> }; >>> +BPF_CALL_5(bpf_find_vma, struct task_struct *, task, u64, start, >>> + bpf_callback_t, callback_fn, void *, callback_ctx, u64, flags) >>> +{ >>> + struct task_iter_irq_work *work = NULL; >>> + struct mm_struct *mm = task->mm; >> >> Won't this NULL deref if called with task argument as NULL? > > Will fix. 
> >> >>> + struct vm_area_struct *vma; >>> + bool irq_work_busy = false; >>> + int ret = -ENOENT; >>> + >>> + if (flags) >>> + return -EINVAL; >>> + >>> + if (!mm) >>> + return -ENOENT; >>> + >>> + /* >>> + * Similar to stackmap with build_id support, we cannot simply do >>> + * mmap_read_unlock when the irq is disabled. Instead, we need to do >>> + * the unlock in the irq_work. >>> + */ >>> + if (irqs_disabled()) { >>> + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) { >>> + work = this_cpu_ptr(&mmap_unlock_work); >>> + if (irq_work_is_busy(&work->irq_work)) { >>> + /* cannot queue more mmap_unlock, abort. */ >>> + irq_work_busy = true; >>> + } >>> + } else { >>> + /* >>> + * PREEMPT_RT does not allow to trylock mmap sem in >>> + * interrupt disabled context, abort. >>> + */ >>> + irq_work_busy = true; >>> + } >>> + } >>> + >>> + if (irq_work_busy || !mmap_read_trylock(mm)) >>> + return -EBUSY; >>> + >>> + vma = find_vma(mm, start); >>> + >>> + if (vma && vma->vm_start <= start && vma->vm_end > start) { >>> + callback_fn((u64)(long)task, (u64)(long)vma, >>> + (u64)(long)callback_ctx, 0, 0); >>> + ret = 0; >>> + } >>> + if (!work) { >>> + mmap_read_unlock(current->mm); >>> + } else { >>> + work->mm = current->mm; >>> + >>> + /* The lock will be released once we're out of interrupt >>> + * context. Tell lockdep that we've released it now so >>> + * it doesn't complain that we forgot to release it. >>> + */ >>> + rwsem_release(&current->mm->mmap_lock.dep_map, _RET_IP_); >>> + irq_work_queue(&work->irq_work); >>> + } >> >> Given this is pretty much the same logic around the vma retrieval, could this be >> refactored/consolidated with stack map build id retrieval into a common function? > > I thought about sharing the irq_work code among the two. The problem was we need > to include irq_work.h in bpf.h. But on second thought, maybe we should just > move bpf_find_vma to stackmap.c? This will avoid including irq_work.h. I guess it > is not too weird to have bpf_find_vma in stackmap.c. 
Actually, we can just add a local header for it in kernel/bpf. Adding bpf_find_vma to stackmap.c means bpf_find_vma requires CONFIG_PERF_EVENTS. It is not a real issue in most systems, but may break some build tests. Song