On Sun, Aug 19, 2018 at 9:42 PM, Ravi Bangoria <ravi.bangoria@xxxxxxxxxxxxx> wrote: > Userspace Statically Defined Tracepoints[1] are dtrace style markers > inside userspace applications. Applications like PostgreSQL, MySQL, > Pthread, Perl, Python, Java, Ruby, Node.js, libvirt, QEMU, glib etc > have these markers embedded in them. These markers are added by developer > at important places in the code. Each marker source expands to a single > nop instruction in the compiled code but there may be additional > overhead for computing the marker arguments which expands to couple of > instructions. In case the overhead is more, execution of it can be > omitted by runtime if() condition when no one is tracing on the marker: > > if (reference_counter > 0) { > Execute marker instructions; > } > > Default value of reference counter is 0. Tracer has to increment the > reference counter before tracing on a marker and decrement it when > done with the tracing. > > Implement the reference counter logic in core uprobe. User will be > able to use it from trace_uprobe as well as from kernel module. New > trace_uprobe definition with reference counter will now be: > > <path>:<offset>[(ref_ctr_offset)] > > where ref_ctr_offset is an optional field. For kernel module, new > variant of uprobe_register() has been introduced: > > uprobe_register_refctr(inode, offset, ref_ctr_offset, consumer) > > No new variant for uprobe_unregister() because it's assumed to have > only one reference counter for one uprobe. > > [1] https://sourceware.org/systemtap/wiki/UserSpaceProbeImplementation > > Note: 'reference counter' is called as 'semaphore' in original Dtrace > (or Systemtap, bcc and even in ELF) documentation and code. But the > term 'semaphore' is misleading in this context. This is just a counter > used to hold number of tracers tracing on a marker. This is not really > used for any synchronization. So we are calling it a 'reference counter' > in kernel / perf code. > > Signed-off-by: Ravi Bangoria <ravi.bangoria@xxxxxxxxxxxxx> > Reviewed-by: Masami Hiramatsu <mhiramat@xxxxxxxxxx> > [Only trace_uprobe.c] > Reviewed-by: Oleg Nesterov <oleg@xxxxxxxxxx> Reviewed-by: Song Liu <songliubraving@xxxxxx> > --- > include/linux/uprobes.h | 5 + > kernel/events/uprobes.c | 259 ++++++++++++++++++++++++++++++++++++++++++-- > kernel/trace/trace.c | 2 +- > kernel/trace/trace_uprobe.c | 38 ++++++- > 4 files changed, 293 insertions(+), 11 deletions(-) > > diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h > index bb9d2084af03..103a48a48872 100644 > --- a/include/linux/uprobes.h > +++ b/include/linux/uprobes.h > @@ -123,6 +123,7 @@ extern unsigned long uprobe_get_swbp_addr(struct pt_regs *regs); > extern unsigned long uprobe_get_trap_addr(struct pt_regs *regs); > extern int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_t); > extern int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc); > +extern int uprobe_register_refctr(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc); > extern int uprobe_apply(struct inode *inode, loff_t offset, struct uprobe_consumer *uc, bool); > extern void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc); > extern int uprobe_mmap(struct vm_area_struct *vma); > @@ -160,6 +161,10 @@ uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc) > { > return -ENOSYS; > } > +static inline int uprobe_register_refctr(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc) > +{ > + return -ENOSYS; > +} > static inline int > uprobe_apply(struct inode *inode, loff_t offset, struct uprobe_consumer *uc, bool add) > { > diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c > index 919c1ce32beb..35065febcb6c 100644 > --- a/kernel/events/uprobes.c > +++ b/kernel/events/uprobes.c > @@ -73,6 +73,7 @@ struct uprobe { > struct uprobe_consumer *consumers; > struct inode *inode; /* Also hold a ref to inode */ > loff_t offset; > + loff_t ref_ctr_offset; > unsigned long flags; > > /* > @@ -88,6 +89,15 @@ struct uprobe { > struct arch_uprobe arch; > }; > > +struct delayed_uprobe { > + struct list_head list; > + struct uprobe *uprobe; > + struct mm_struct *mm; > +}; > + > +static DEFINE_MUTEX(delayed_uprobe_lock); > +static LIST_HEAD(delayed_uprobe_list); > + > /* > * Execute out of line area: anonymous executable mapping installed > * by the probed task to execute the copy of the original instruction > @@ -282,6 +292,166 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t > return 1; > } > > +static struct delayed_uprobe * > +delayed_uprobe_check(struct uprobe *uprobe, struct mm_struct *mm) > +{ > + struct delayed_uprobe *du; > + > + list_for_each_entry(du, &delayed_uprobe_list, list) > + if (du->uprobe == uprobe && du->mm == mm) > + return du; > + return NULL; > +} > + > +static int delayed_uprobe_add(struct uprobe *uprobe, struct mm_struct *mm) > +{ > + struct delayed_uprobe *du; > + > + if (delayed_uprobe_check(uprobe, mm)) > + return 0; > + > + du = kzalloc(sizeof(*du), GFP_KERNEL); > + if (!du) > + return -ENOMEM; > + > + du->uprobe = uprobe; > + du->mm = mm; > + list_add(&du->list, &delayed_uprobe_list); > + return 0; > +} > + > +static void delayed_uprobe_delete(struct delayed_uprobe *du) > +{ > + if (WARN_ON(!du)) > + return; > + list_del(&du->list); > + kfree(du); > +} > + > +static void delayed_uprobe_remove(struct uprobe *uprobe, struct mm_struct *mm) > +{ > + struct list_head *pos, *q; > + struct delayed_uprobe *du; > + > + if (!uprobe && !mm) > + return; > + > + list_for_each_safe(pos, q, &delayed_uprobe_list) { > + du = list_entry(pos, struct delayed_uprobe, list); > + > + if (uprobe && du->uprobe != uprobe) > + continue; > + if (mm && du->mm != mm) > + continue; > + > + delayed_uprobe_delete(du); > + } > +} > + > +static bool valid_ref_ctr_vma(struct uprobe *uprobe, > + struct vm_area_struct *vma) > +{ > + unsigned long vaddr = offset_to_vaddr(vma, uprobe->ref_ctr_offset); > + > + return uprobe->ref_ctr_offset && > + vma->vm_file && > + file_inode(vma->vm_file) == uprobe->inode && > + (vma->vm_flags & (VM_WRITE|VM_SHARED)) == VM_WRITE && > + vma->vm_start <= vaddr && > + vma->vm_end > vaddr; > +} > + > +static struct vm_area_struct * > +find_ref_ctr_vma(struct uprobe *uprobe, struct mm_struct *mm) > +{ > + struct vm_area_struct *tmp; > + > + for (tmp = mm->mmap; tmp; tmp = tmp->vm_next) > + if (valid_ref_ctr_vma(uprobe, tmp)) > + return tmp; > + > + return NULL; > +} > + > +static int > +__update_ref_ctr(struct mm_struct *mm, unsigned long vaddr, short d) > +{ > + void *kaddr; > + struct page *page; > + struct vm_area_struct *vma; > + int ret; > + short *ptr; > + > + if (!vaddr || !d) > + return -EINVAL; > + > + ret = get_user_pages_remote(NULL, mm, vaddr, 1, > + FOLL_WRITE, &page, &vma, NULL); > + if (unlikely(ret <= 0)) { > + /* > + * We are asking for 1 page. If get_user_pages_remote() fails, > + * it may return 0, in that case we have to return error. > + */ > + return ret == 0 ? -EBUSY : ret; > + } > + > + kaddr = kmap_atomic(page); > + ptr = kaddr + (vaddr & ~PAGE_MASK); > + > + if (unlikely(*ptr + d < 0)) { > + pr_warn("ref_ctr going negative. vaddr: 0x%lx, " > + "curr val: %d, delta: %d\n", vaddr, *ptr, d); > + ret = -EINVAL; > + goto out; > + } > + > + *ptr += d; > + ret = 0; > +out: > + kunmap_atomic(kaddr); > + put_page(page); > + return ret; > +} > + > +static void update_ref_ctr_warn(struct uprobe *uprobe, > + struct mm_struct *mm, short d) > +{ > + pr_warn("ref_ctr %s failed for inode: 0x%lx offset: " > + "0x%llx ref_ctr_offset: 0x%llx of mm: 0x%pK\n", > + d > 0 ? "increment" : "decrement", uprobe->inode->i_ino, > + (unsigned long long) uprobe->offset, > + (unsigned long long) uprobe->ref_ctr_offset, mm); > +} > + > +static int update_ref_ctr(struct uprobe *uprobe, struct mm_struct *mm, > + short d) > +{ > + struct vm_area_struct *rc_vma; > + unsigned long rc_vaddr; > + int ret = 0; > + > + rc_vma = find_ref_ctr_vma(uprobe, mm); > + > + if (rc_vma) { > + rc_vaddr = offset_to_vaddr(rc_vma, uprobe->ref_ctr_offset); > + ret = __update_ref_ctr(mm, rc_vaddr, d); > + if (ret) > + update_ref_ctr_warn(uprobe, mm, d); > + > + if (d > 0) > + return ret; > + } > + > + mutex_lock(&delayed_uprobe_lock); > + if (d > 0) > + ret = delayed_uprobe_add(uprobe, mm); > + else > + delayed_uprobe_remove(uprobe, mm); > + mutex_unlock(&delayed_uprobe_lock); > + > + return ret; > +} > + > /* > * NOTE: > * Expect the breakpoint instruction to be the smallest size instruction for > @@ -302,9 +472,13 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t > int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, > unsigned long vaddr, uprobe_opcode_t opcode) > { > + struct uprobe *uprobe; > struct page *old_page, *new_page; > struct vm_area_struct *vma; > - int ret; > + int ret, is_register, ref_ctr_updated = 0; > + > + is_register = is_swbp_insn(&opcode); > + uprobe = container_of(auprobe, struct uprobe, arch); > > retry: > /* Read the page with vaddr into memory */ > @@ -317,6 +491,15 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, > if (ret <= 0) > goto put_old; > > + /* We are going to replace instruction, update ref_ctr. */ > + if (!ref_ctr_updated && uprobe->ref_ctr_offset) { > + ret = update_ref_ctr(uprobe, mm, is_register ? 1 : -1); > + if (ret) > + goto put_old; > + > + ref_ctr_updated = 1; > + } > + > ret = anon_vma_prepare(vma); > if (ret) > goto put_old; > @@ -337,6 +520,11 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, > > if (unlikely(ret == -EAGAIN)) > goto retry; > + > + /* Revert back reference counter if instruction update failed. */ > + if (ret && is_register && ref_ctr_updated) > + update_ref_ctr(uprobe, mm, -1); > + > return ret; > } > > @@ -378,8 +566,15 @@ static struct uprobe *get_uprobe(struct uprobe *uprobe) > > static void put_uprobe(struct uprobe *uprobe) > { > - if (atomic_dec_and_test(&uprobe->ref)) > + if (atomic_dec_and_test(&uprobe->ref)) { > + /* > + * If application munmap(exec_vma) before uprobe_unregister() > + * gets called, we don't get a chance to remove uprobe from > + * delayed_uprobe_list from remove_breakpoint(). Do it here. > + */ > + delayed_uprobe_remove(uprobe, NULL); > kfree(uprobe); > + } > } > > static int match_uprobe(struct uprobe *l, struct uprobe *r) > @@ -484,7 +679,8 @@ static struct uprobe *insert_uprobe(struct uprobe *uprobe) > return u; > } > > -static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset) > +static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset, > + loff_t ref_ctr_offset) > { > struct uprobe *uprobe, *cur_uprobe; > > @@ -494,6 +690,7 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset) > > uprobe->inode = inode; > uprobe->offset = offset; > + uprobe->ref_ctr_offset = ref_ctr_offset; > init_rwsem(&uprobe->register_rwsem); > init_rwsem(&uprobe->consumer_rwsem); > > @@ -895,7 +1092,7 @@ EXPORT_SYMBOL_GPL(uprobe_unregister); > * else return 0 (success) > */ > static int __uprobe_register(struct inode *inode, loff_t offset, > - struct uprobe_consumer *uc) > + loff_t ref_ctr_offset, struct uprobe_consumer *uc) > { > struct uprobe *uprobe; > int ret; > @@ -912,7 +1109,7 @@ static int __uprobe_register(struct inode *inode, loff_t offset, > return -EINVAL; > > retry: > - uprobe = alloc_uprobe(inode, offset); > + uprobe = alloc_uprobe(inode, offset, ref_ctr_offset); > if (!uprobe) > return -ENOMEM; > /* > @@ -938,10 +1135,17 @@ static int __uprobe_register(struct inode *inode, loff_t offset, > int uprobe_register(struct inode *inode, loff_t offset, > struct uprobe_consumer *uc) > { > - return __uprobe_register(inode, offset, uc); > + return __uprobe_register(inode, offset, 0, uc); > } > EXPORT_SYMBOL_GPL(uprobe_register); > > +int uprobe_register_refctr(struct inode *inode, loff_t offset, > + loff_t ref_ctr_offset, struct uprobe_consumer *uc) > +{ > + return __uprobe_register(inode, offset, ref_ctr_offset, uc); > +} > +EXPORT_SYMBOL_GPL(uprobe_register_refctr); > + > /* > * uprobe_apply - unregister a already registered probe. > * @inode: the file in which the probe has to be removed. > @@ -1060,6 +1264,35 @@ static void build_probe_list(struct inode *inode, > spin_unlock(&uprobes_treelock); > } > > +/* @vma contains reference counter, not the probed instruction. */ > +static int delayed_ref_ctr_inc(struct vm_area_struct *vma) > +{ > + struct list_head *pos, *q; > + struct delayed_uprobe *du; > + unsigned long vaddr; > + int ret = 0, err = 0; > + > + mutex_lock(&delayed_uprobe_lock); > + list_for_each_safe(pos, q, &delayed_uprobe_list) { > + du = list_entry(pos, struct delayed_uprobe, list); > + > + if (du->mm != vma->vm_mm || > + !valid_ref_ctr_vma(du->uprobe, vma)) > + continue; > + > + vaddr = offset_to_vaddr(vma, du->uprobe->ref_ctr_offset); > + ret = __update_ref_ctr(vma->vm_mm, vaddr, 1); > + if (ret) { > + update_ref_ctr_warn(du->uprobe, vma->vm_mm, 1); > + if (!err) > + err = ret; > + } > + delayed_uprobe_delete(du); > + } > + mutex_unlock(&delayed_uprobe_lock); > + return err; > +} > + > /* > * Called from mmap_region/vma_adjust with mm->mmap_sem acquired. > * > @@ -1072,7 +1305,15 @@ int uprobe_mmap(struct vm_area_struct *vma) > struct uprobe *uprobe, *u; > struct inode *inode; > > - if (no_uprobe_events() || !valid_vma(vma, true)) > + if (no_uprobe_events()) > + return 0; > + > + if (vma->vm_file && > + (vma->vm_flags & (VM_WRITE|VM_SHARED)) == VM_WRITE && > + test_bit(MMF_HAS_UPROBES, &vma->vm_mm->flags)) > + delayed_ref_ctr_inc(vma); > + > + if (!valid_vma(vma, true)) > return 0; > > inode = file_inode(vma->vm_file); > @@ -1246,6 +1487,10 @@ void uprobe_clear_state(struct mm_struct *mm) > { > struct xol_area *area = mm->uprobes_state.xol_area; > > + mutex_lock(&delayed_uprobe_lock); > + delayed_uprobe_remove(NULL, mm); > + mutex_unlock(&delayed_uprobe_lock); > + > if (!area) > return; > > diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c > index 2dad27809794..23689831f656 100644 > --- a/kernel/trace/trace.c > +++ b/kernel/trace/trace.c > @@ -4620,7 +4620,7 @@ static const char readme_msg[] = > "place (kretprobe): [<module>:]<symbol>[+<offset>]|<memaddr>\n" > #endif > #ifdef CONFIG_UPROBE_EVENTS > - "\t place: <path>:<offset>\n" > + " place (uprobe): <path>:<offset>[(ref_ctr_offset)]\n" > #endif > "\t args: <name>=fetcharg[:type]\n" > "\t fetcharg: %<register>, @<address>, @<symbol>[+|-<offset>],\n" > diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c > index ac02fafc9f1b..a7ef6c4ca16e 100644 > --- a/kernel/trace/trace_uprobe.c > +++ b/kernel/trace/trace_uprobe.c > @@ -59,6 +59,7 @@ struct trace_uprobe { > struct inode *inode; > char *filename; > unsigned long offset; > + unsigned long ref_ctr_offset; > unsigned long nhit; > struct trace_probe tp; > }; > @@ -364,10 +365,10 @@ static int register_trace_uprobe(struct trace_uprobe *tu) > static int create_trace_uprobe(int argc, char **argv) > { > struct trace_uprobe *tu; > - char *arg, *event, *group, *filename; > + char *arg, *event, *group, *filename, *rctr, *rctr_end; > char buf[MAX_EVENT_NAME_LEN]; > struct path path; > - unsigned long offset; > + unsigned long offset, ref_ctr_offset; > bool is_delete, is_return; > int i, ret; > > @@ -376,6 +377,7 @@ static int create_trace_uprobe(int argc, char **argv) > is_return = false; > event = NULL; > group = NULL; > + ref_ctr_offset = 0; > > /* argc must be >= 1 */ > if (argv[0][0] == '-') > @@ -450,6 +452,26 @@ static int create_trace_uprobe(int argc, char **argv) > goto fail_address_parse; > } > > + /* Parse reference counter offset if specified. */ > + rctr = strchr(arg, '('); > + if (rctr) { > + rctr_end = strchr(rctr, ')'); > + if (rctr > rctr_end || *(rctr_end + 1) != 0) { > + ret = -EINVAL; > + pr_info("Invalid reference counter offset.\n"); > + goto fail_address_parse; > + } > + > + *rctr++ = '\0'; > + *rctr_end = '\0'; > + ret = kstrtoul(rctr, 0, &ref_ctr_offset); > + if (ret) { > + pr_info("Invalid reference counter offset.\n"); > + goto fail_address_parse; > + } > + } > + > + /* Parse uprobe offset. */ > ret = kstrtoul(arg, 0, &offset); > if (ret) > goto fail_address_parse; > @@ -484,6 +506,7 @@ static int create_trace_uprobe(int argc, char **argv) > goto fail_address_parse; > } > tu->offset = offset; > + tu->ref_ctr_offset = ref_ctr_offset; > tu->path = path; > tu->filename = kstrdup(filename, GFP_KERNEL); > > @@ -602,6 +625,9 @@ static int probes_seq_show(struct seq_file *m, void *v) > trace_event_name(&tu->tp.call), tu->filename, > (int)(sizeof(void *) * 2), tu->offset); > > + if (tu->ref_ctr_offset) > + seq_printf(m, "(0x%lx)", tu->ref_ctr_offset); > + > for (i = 0; i < tu->tp.nr_args; i++) > seq_printf(m, " %s=%s", tu->tp.args[i].name, tu->tp.args[i].comm); > > @@ -917,7 +943,13 @@ probe_event_enable(struct trace_uprobe *tu, struct trace_event_file *file, > > tu->consumer.filter = filter; > tu->inode = d_real_inode(tu->path.dentry); > - ret = uprobe_register(tu->inode, tu->offset, &tu->consumer); > + if (tu->ref_ctr_offset) { > + ret = uprobe_register_refctr(tu->inode, tu->offset, > + tu->ref_ctr_offset, &tu->consumer); > + } else { > + ret = uprobe_register(tu->inode, tu->offset, &tu->consumer); > + } > + > if (ret) > goto err_buffer; > > -- > 2.14.4 >