On Sun, Aug 19, 2018 at 10:53 PM, Song Liu <liu.song.a23@xxxxxxxxx> wrote: > On Sun, Aug 19, 2018 at 9:42 PM, Ravi Bangoria > <ravi.bangoria@xxxxxxxxxxxxx> wrote: >> Userspace Statically Defined Tracepoints[1] are DTrace-style markers >> inside userspace applications. Applications like PostgreSQL, MySQL, >> Pthread, Perl, Python, Java, Ruby, Node.js, libvirt, QEMU, glib etc. >> have these markers embedded in them. These markers are added by developers >> at important places in the code. Each marker source expands to a single >> nop instruction in the compiled code, but there may be additional >> overhead for computing the marker arguments, which expands to a couple of >> instructions. If that overhead is significant, its execution can be >> omitted by a runtime if() condition when no one is tracing on the marker: >> >> if (reference_counter > 0) { >> Execute marker instructions; >> } >> >> The default value of the reference counter is 0. A tracer has to increment the >> reference counter before tracing on a marker and decrement it when >> done with the tracing. >> >> Implement the reference counter logic in core uprobe. Users will be >> able to use it from trace_uprobe as well as from a kernel module. The new >> trace_uprobe definition with a reference counter will now be: >> >> <path>:<offset>[(ref_ctr_offset)] >> >> where ref_ctr_offset is an optional field. For kernel modules, a new >> variant of uprobe_register() has been introduced: >> >> uprobe_register_refctr(inode, offset, ref_ctr_offset, consumer) >> >> There is no new variant of uprobe_unregister() because a uprobe is assumed to have >> only one reference counter. >> >> [1] https://sourceware.org/systemtap/wiki/UserSpaceProbeImplementation >> >> Note: the 'reference counter' is called a 'semaphore' in the original DTrace >> (or SystemTap, bcc and even in ELF) documentation and code. But the >> term 'semaphore' is misleading in this context. This is just a counter >> used to hold the number of tracers tracing on a marker. 
This is not really >> used for any synchronization. So we are calling it a 'reference counter' >> in kernel / perf code. >> >> Signed-off-by: Ravi Bangoria <ravi.bangoria@xxxxxxxxxxxxx> >> Reviewed-by: Masami Hiramatsu <mhiramat@xxxxxxxxxx> >> [Only trace_uprobe.c] >> Reviewed-by: Oleg Nesterov <oleg@xxxxxxxxxx> > > Reviewed-by: Song Liu <songliubraving@xxxxxx> Reviewed-and-tested-by: Song Liu <songliubraving@xxxxxx> > >> --- >> include/linux/uprobes.h | 5 + >> kernel/events/uprobes.c | 259 ++++++++++++++++++++++++++++++++++++++++++-- >> kernel/trace/trace.c | 2 +- >> kernel/trace/trace_uprobe.c | 38 ++++++- >> 4 files changed, 293 insertions(+), 11 deletions(-) >> >> diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h >> index bb9d2084af03..103a48a48872 100644 >> --- a/include/linux/uprobes.h >> +++ b/include/linux/uprobes.h >> @@ -123,6 +123,7 @@ extern unsigned long uprobe_get_swbp_addr(struct pt_regs *regs); >> extern unsigned long uprobe_get_trap_addr(struct pt_regs *regs); >> extern int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_t); >> extern int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc); >> +extern int uprobe_register_refctr(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc); >> extern int uprobe_apply(struct inode *inode, loff_t offset, struct uprobe_consumer *uc, bool); >> extern void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc); >> extern int uprobe_mmap(struct vm_area_struct *vma); >> @@ -160,6 +161,10 @@ uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc) >> { >> return -ENOSYS; >> } >> +static inline int uprobe_register_refctr(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc) >> +{ >> + return -ENOSYS; >> +} >> static inline int >> uprobe_apply(struct inode *inode, loff_t offset, struct uprobe_consumer *uc, 
bool add) >> { >> diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c >> index 919c1ce32beb..35065febcb6c 100644 >> --- a/kernel/events/uprobes.c >> +++ b/kernel/events/uprobes.c >> @@ -73,6 +73,7 @@ struct uprobe { >> struct uprobe_consumer *consumers; >> struct inode *inode; /* Also hold a ref to inode */ >> loff_t offset; >> + loff_t ref_ctr_offset; >> unsigned long flags; >> >> /* >> @@ -88,6 +89,15 @@ struct uprobe { >> struct arch_uprobe arch; >> }; >> >> +struct delayed_uprobe { >> + struct list_head list; >> + struct uprobe *uprobe; >> + struct mm_struct *mm; >> +}; >> + >> +static DEFINE_MUTEX(delayed_uprobe_lock); >> +static LIST_HEAD(delayed_uprobe_list); >> + >> /* >> * Execute out of line area: anonymous executable mapping installed >> * by the probed task to execute the copy of the original instruction >> @@ -282,6 +292,166 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t >> return 1; >> } >> >> +static struct delayed_uprobe * >> +delayed_uprobe_check(struct uprobe *uprobe, struct mm_struct *mm) >> +{ >> + struct delayed_uprobe *du; >> + >> + list_for_each_entry(du, &delayed_uprobe_list, list) >> + if (du->uprobe == uprobe && du->mm == mm) >> + return du; >> + return NULL; >> +} >> + >> +static int delayed_uprobe_add(struct uprobe *uprobe, struct mm_struct *mm) >> +{ >> + struct delayed_uprobe *du; >> + >> + if (delayed_uprobe_check(uprobe, mm)) >> + return 0; >> + >> + du = kzalloc(sizeof(*du), GFP_KERNEL); >> + if (!du) >> + return -ENOMEM; >> + >> + du->uprobe = uprobe; >> + du->mm = mm; >> + list_add(&du->list, &delayed_uprobe_list); >> + return 0; >> +} >> + >> +static void delayed_uprobe_delete(struct delayed_uprobe *du) >> +{ >> + if (WARN_ON(!du)) >> + return; >> + list_del(&du->list); >> + kfree(du); >> +} >> + >> +static void delayed_uprobe_remove(struct uprobe *uprobe, struct mm_struct *mm) >> +{ >> + struct list_head *pos, *q; >> + struct delayed_uprobe *du; >> + >> + if (!uprobe && !mm) >> + 
return; >> + >> + list_for_each_safe(pos, q, &delayed_uprobe_list) { >> + du = list_entry(pos, struct delayed_uprobe, list); >> + >> + if (uprobe && du->uprobe != uprobe) >> + continue; >> + if (mm && du->mm != mm) >> + continue; >> + >> + delayed_uprobe_delete(du); >> + } >> +} >> + >> +static bool valid_ref_ctr_vma(struct uprobe *uprobe, >> + struct vm_area_struct *vma) >> +{ >> + unsigned long vaddr = offset_to_vaddr(vma, uprobe->ref_ctr_offset); >> + >> + return uprobe->ref_ctr_offset && >> + vma->vm_file && >> + file_inode(vma->vm_file) == uprobe->inode && >> + (vma->vm_flags & (VM_WRITE|VM_SHARED)) == VM_WRITE && >> + vma->vm_start <= vaddr && >> + vma->vm_end > vaddr; >> +} >> + >> +static struct vm_area_struct * >> +find_ref_ctr_vma(struct uprobe *uprobe, struct mm_struct *mm) >> +{ >> + struct vm_area_struct *tmp; >> + >> + for (tmp = mm->mmap; tmp; tmp = tmp->vm_next) >> + if (valid_ref_ctr_vma(uprobe, tmp)) >> + return tmp; >> + >> + return NULL; >> +} >> + >> +static int >> +__update_ref_ctr(struct mm_struct *mm, unsigned long vaddr, short d) >> +{ >> + void *kaddr; >> + struct page *page; >> + struct vm_area_struct *vma; >> + int ret; >> + short *ptr; >> + >> + if (!vaddr || !d) >> + return -EINVAL; >> + >> + ret = get_user_pages_remote(NULL, mm, vaddr, 1, >> + FOLL_WRITE, &page, &vma, NULL); >> + if (unlikely(ret <= 0)) { >> + /* >> + * We are asking for 1 page. If get_user_pages_remote() fails, >> + * it may return 0, in that case we have to return error. >> + */ >> + return ret == 0 ? -EBUSY : ret; >> + } >> + >> + kaddr = kmap_atomic(page); >> + ptr = kaddr + (vaddr & ~PAGE_MASK); >> + >> + if (unlikely(*ptr + d < 0)) { >> + pr_warn("ref_ctr going negative. 
vaddr: 0x%lx, " >> + "curr val: %d, delta: %d\n", vaddr, *ptr, d); >> + ret = -EINVAL; >> + goto out; >> + } >> + >> + *ptr += d; >> + ret = 0; >> +out: >> + kunmap_atomic(kaddr); >> + put_page(page); >> + return ret; >> +} >> + >> +static void update_ref_ctr_warn(struct uprobe *uprobe, >> + struct mm_struct *mm, short d) >> +{ >> + pr_warn("ref_ctr %s failed for inode: 0x%lx offset: " >> + "0x%llx ref_ctr_offset: 0x%llx of mm: 0x%pK\n", >> + d > 0 ? "increment" : "decrement", uprobe->inode->i_ino, >> + (unsigned long long) uprobe->offset, >> + (unsigned long long) uprobe->ref_ctr_offset, mm); >> +} >> + >> +static int update_ref_ctr(struct uprobe *uprobe, struct mm_struct *mm, >> + short d) >> +{ >> + struct vm_area_struct *rc_vma; >> + unsigned long rc_vaddr; >> + int ret = 0; >> + >> + rc_vma = find_ref_ctr_vma(uprobe, mm); >> + >> + if (rc_vma) { >> + rc_vaddr = offset_to_vaddr(rc_vma, uprobe->ref_ctr_offset); >> + ret = __update_ref_ctr(mm, rc_vaddr, d); >> + if (ret) >> + update_ref_ctr_warn(uprobe, mm, d); >> + >> + if (d > 0) >> + return ret; >> + } >> + >> + mutex_lock(&delayed_uprobe_lock); >> + if (d > 0) >> + ret = delayed_uprobe_add(uprobe, mm); >> + else >> + delayed_uprobe_remove(uprobe, mm); >> + mutex_unlock(&delayed_uprobe_lock); >> + >> + return ret; >> +} >> + >> /* >> * NOTE: >> * Expect the breakpoint instruction to be the smallest size instruction for >> @@ -302,9 +472,13 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t >> int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, >> unsigned long vaddr, uprobe_opcode_t opcode) >> { >> + struct uprobe *uprobe; >> struct page *old_page, *new_page; >> struct vm_area_struct *vma; >> - int ret; >> + int ret, is_register, ref_ctr_updated = 0; >> + >> + is_register = is_swbp_insn(&opcode); >> + uprobe = container_of(auprobe, struct uprobe, arch); >> >> retry: >> /* Read the page with vaddr into memory */ >> @@ -317,6 +491,15 @@ int 
uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, >> if (ret <= 0) >> goto put_old; >> >> + /* We are going to replace instruction, update ref_ctr. */ >> + if (!ref_ctr_updated && uprobe->ref_ctr_offset) { >> + ret = update_ref_ctr(uprobe, mm, is_register ? 1 : -1); >> + if (ret) >> + goto put_old; >> + >> + ref_ctr_updated = 1; >> + } >> + >> ret = anon_vma_prepare(vma); >> if (ret) >> goto put_old; >> @@ -337,6 +520,11 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, >> >> if (unlikely(ret == -EAGAIN)) >> goto retry; >> + >> + /* Revert back reference counter if instruction update failed. */ >> + if (ret && is_register && ref_ctr_updated) >> + update_ref_ctr(uprobe, mm, -1); >> + >> return ret; >> } >> >> @@ -378,8 +566,15 @@ static struct uprobe *get_uprobe(struct uprobe *uprobe) >> >> static void put_uprobe(struct uprobe *uprobe) >> { >> - if (atomic_dec_and_test(&uprobe->ref)) >> + if (atomic_dec_and_test(&uprobe->ref)) { >> + /* >> + * If application munmap(exec_vma) before uprobe_unregister() >> + * gets called, we don't get a chance to remove uprobe from >> + * delayed_uprobe_list from remove_breakpoint(). Do it here. 
>> + */ >> + delayed_uprobe_remove(uprobe, NULL); >> kfree(uprobe); >> + } >> } >> >> static int match_uprobe(struct uprobe *l, struct uprobe *r) >> @@ -484,7 +679,8 @@ static struct uprobe *insert_uprobe(struct uprobe *uprobe) >> return u; >> } >> >> -static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset) >> +static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset, >> + loff_t ref_ctr_offset) >> { >> struct uprobe *uprobe, *cur_uprobe; >> >> @@ -494,6 +690,7 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset) >> >> uprobe->inode = inode; >> uprobe->offset = offset; >> + uprobe->ref_ctr_offset = ref_ctr_offset; >> init_rwsem(&uprobe->register_rwsem); >> init_rwsem(&uprobe->consumer_rwsem); >> >> @@ -895,7 +1092,7 @@ EXPORT_SYMBOL_GPL(uprobe_unregister); >> * else return 0 (success) >> */ >> static int __uprobe_register(struct inode *inode, loff_t offset, >> - struct uprobe_consumer *uc) >> + loff_t ref_ctr_offset, struct uprobe_consumer *uc) >> { >> struct uprobe *uprobe; >> int ret; >> @@ -912,7 +1109,7 @@ static int __uprobe_register(struct inode *inode, loff_t offset, >> return -EINVAL; >> >> retry: >> - uprobe = alloc_uprobe(inode, offset); >> + uprobe = alloc_uprobe(inode, offset, ref_ctr_offset); >> if (!uprobe) >> return -ENOMEM; >> /* >> @@ -938,10 +1135,17 @@ static int __uprobe_register(struct inode *inode, loff_t offset, >> int uprobe_register(struct inode *inode, loff_t offset, >> struct uprobe_consumer *uc) >> { >> - return __uprobe_register(inode, offset, uc); >> + return __uprobe_register(inode, offset, 0, uc); >> } >> EXPORT_SYMBOL_GPL(uprobe_register); >> >> +int uprobe_register_refctr(struct inode *inode, loff_t offset, >> + loff_t ref_ctr_offset, struct uprobe_consumer *uc) >> +{ >> + return __uprobe_register(inode, offset, ref_ctr_offset, uc); >> +} >> +EXPORT_SYMBOL_GPL(uprobe_register_refctr); >> + >> /* >> * uprobe_apply - unregister a already registered probe. 
>> * @inode: the file in which the probe has to be removed. >> @@ -1060,6 +1264,35 @@ static void build_probe_list(struct inode *inode, >> spin_unlock(&uprobes_treelock); >> } >> >> +/* @vma contains reference counter, not the probed instruction. */ >> +static int delayed_ref_ctr_inc(struct vm_area_struct *vma) >> +{ >> + struct list_head *pos, *q; >> + struct delayed_uprobe *du; >> + unsigned long vaddr; >> + int ret = 0, err = 0; >> + >> + mutex_lock(&delayed_uprobe_lock); >> + list_for_each_safe(pos, q, &delayed_uprobe_list) { >> + du = list_entry(pos, struct delayed_uprobe, list); >> + >> + if (du->mm != vma->vm_mm || >> + !valid_ref_ctr_vma(du->uprobe, vma)) >> + continue; >> + >> + vaddr = offset_to_vaddr(vma, du->uprobe->ref_ctr_offset); >> + ret = __update_ref_ctr(vma->vm_mm, vaddr, 1); >> + if (ret) { >> + update_ref_ctr_warn(du->uprobe, vma->vm_mm, 1); >> + if (!err) >> + err = ret; >> + } >> + delayed_uprobe_delete(du); >> + } >> + mutex_unlock(&delayed_uprobe_lock); >> + return err; >> +} >> + >> /* >> * Called from mmap_region/vma_adjust with mm->mmap_sem acquired. 
>> * >> @@ -1072,7 +1305,15 @@ int uprobe_mmap(struct vm_area_struct *vma) >> struct uprobe *uprobe, *u; >> struct inode *inode; >> >> - if (no_uprobe_events() || !valid_vma(vma, true)) >> + if (no_uprobe_events()) >> + return 0; >> + >> + if (vma->vm_file && >> + (vma->vm_flags & (VM_WRITE|VM_SHARED)) == VM_WRITE && >> + test_bit(MMF_HAS_UPROBES, &vma->vm_mm->flags)) >> + delayed_ref_ctr_inc(vma); >> + >> + if (!valid_vma(vma, true)) >> return 0; >> >> inode = file_inode(vma->vm_file); >> @@ -1246,6 +1487,10 @@ void uprobe_clear_state(struct mm_struct *mm) >> { >> struct xol_area *area = mm->uprobes_state.xol_area; >> >> + mutex_lock(&delayed_uprobe_lock); >> + delayed_uprobe_remove(NULL, mm); >> + mutex_unlock(&delayed_uprobe_lock); >> + >> if (!area) >> return; >> >> diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c >> index 2dad27809794..23689831f656 100644 >> --- a/kernel/trace/trace.c >> +++ b/kernel/trace/trace.c >> @@ -4620,7 +4620,7 @@ static const char readme_msg[] = >> "place (kretprobe): [<module>:]<symbol>[+<offset>]|<memaddr>\n" >> #endif >> #ifdef CONFIG_UPROBE_EVENTS >> - "\t place: <path>:<offset>\n" >> + " place (uprobe): <path>:<offset>[(ref_ctr_offset)]\n" >> #endif >> "\t args: <name>=fetcharg[:type]\n" >> "\t fetcharg: %<register>, @<address>, @<symbol>[+|-<offset>],\n" >> diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c >> index ac02fafc9f1b..a7ef6c4ca16e 100644 >> --- a/kernel/trace/trace_uprobe.c >> +++ b/kernel/trace/trace_uprobe.c >> @@ -59,6 +59,7 @@ struct trace_uprobe { >> struct inode *inode; >> char *filename; >> unsigned long offset; >> + unsigned long ref_ctr_offset; >> unsigned long nhit; >> struct trace_probe tp; >> }; >> @@ -364,10 +365,10 @@ static int register_trace_uprobe(struct trace_uprobe *tu) >> static int create_trace_uprobe(int argc, char **argv) >> { >> struct trace_uprobe *tu; >> - char *arg, *event, *group, *filename; >> + char *arg, *event, *group, *filename, *rctr, *rctr_end; >> char 
buf[MAX_EVENT_NAME_LEN]; >> struct path path; >> - unsigned long offset; >> + unsigned long offset, ref_ctr_offset; >> bool is_delete, is_return; >> int i, ret; >> >> @@ -376,6 +377,7 @@ static int create_trace_uprobe(int argc, char **argv) >> is_return = false; >> event = NULL; >> group = NULL; >> + ref_ctr_offset = 0; >> >> /* argc must be >= 1 */ >> if (argv[0][0] == '-') >> @@ -450,6 +452,26 @@ static int create_trace_uprobe(int argc, char **argv) >> goto fail_address_parse; >> } >> >> + /* Parse reference counter offset if specified. */ >> + rctr = strchr(arg, '('); >> + if (rctr) { >> + rctr_end = strchr(rctr, ')'); >> + if (rctr > rctr_end || *(rctr_end + 1) != 0) { >> + ret = -EINVAL; >> + pr_info("Invalid reference counter offset.\n"); >> + goto fail_address_parse; >> + } >> + >> + *rctr++ = '\0'; >> + *rctr_end = '\0'; >> + ret = kstrtoul(rctr, 0, &ref_ctr_offset); >> + if (ret) { >> + pr_info("Invalid reference counter offset.\n"); >> + goto fail_address_parse; >> + } >> + } >> + >> + /* Parse uprobe offset. 
*/ >> ret = kstrtoul(arg, 0, &offset); >> if (ret) >> goto fail_address_parse; >> @@ -484,6 +506,7 @@ static int create_trace_uprobe(int argc, char **argv) >> goto fail_address_parse; >> } >> tu->offset = offset; >> + tu->ref_ctr_offset = ref_ctr_offset; >> tu->path = path; >> tu->filename = kstrdup(filename, GFP_KERNEL); >> >> @@ -602,6 +625,9 @@ static int probes_seq_show(struct seq_file *m, void *v) >> trace_event_name(&tu->tp.call), tu->filename, >> (int)(sizeof(void *) * 2), tu->offset); >> >> + if (tu->ref_ctr_offset) >> + seq_printf(m, "(0x%lx)", tu->ref_ctr_offset); >> + >> for (i = 0; i < tu->tp.nr_args; i++) >> seq_printf(m, " %s=%s", tu->tp.args[i].name, tu->tp.args[i].comm); >> >> @@ -917,7 +943,13 @@ probe_event_enable(struct trace_uprobe *tu, struct trace_event_file *file, >> >> tu->consumer.filter = filter; >> tu->inode = d_real_inode(tu->path.dentry); >> - ret = uprobe_register(tu->inode, tu->offset, &tu->consumer); >> + if (tu->ref_ctr_offset) { >> + ret = uprobe_register_refctr(tu->inode, tu->offset, >> + tu->ref_ctr_offset, &tu->consumer); >> + } else { >> + ret = uprobe_register(tu->inode, tu->offset, &tu->consumer); >> + } >> + >> if (ret) >> goto err_buffer; >> >> -- >> 2.14.4 >>