Re: [PATCH v4 4/8] uprobes: travers uprobe's consumer list locklessly under SRCU protection

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Thu, Aug 29, 2024 at 04:31:18PM -0700, Andrii Nakryiko wrote:
> On Thu, Aug 29, 2024 at 4:10 PM Jiri Olsa <olsajiri@xxxxxxxxx> wrote:
> >
> > On Thu, Aug 29, 2024 at 11:37:37AM -0700, Andrii Nakryiko wrote:
> > > uprobe->register_rwsem is one of a few big bottlenecks to scalability of
> > > uprobes, so we need to get rid of it to improve uprobe performance and
> > > multi-CPU scalability.
> > >
> > > First, we turn uprobe's consumer list to a typical doubly-linked list
> > > and utilize existing RCU-aware helpers for traversing such lists, as
> > > well as adding and removing elements from it.
> > >
> > > For entry uprobes we already have SRCU protection active since before
> > > uprobe lookup. For uretprobe we keep refcount, guaranteeing that uprobe
> > > won't go away from under us, but we add SRCU protection around consumer
> > > list traversal.
> > >
> > > Lastly, to keep handler_chain()'s UPROBE_HANDLER_REMOVE handling simple,
> > > we remember whether any removal was requested during handler calls, but
> > > then we double-check the decision under a proper register_rwsem using
> > > consumers' filter callbacks. Handler removal is very rare, so this extra
> > > lock won't hurt performance, overall, but we also avoid the need for any
> > > extra protection (e.g., seqcount locks).
> > >
> > > Signed-off-by: Andrii Nakryiko <andrii@xxxxxxxxxx>
> > > ---
> > >  include/linux/uprobes.h |   2 +-
> > >  kernel/events/uprobes.c | 104 +++++++++++++++++++++++-----------------
> > >  2 files changed, 62 insertions(+), 44 deletions(-)
> > >
> > > diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
> > > index 9cf0dce62e4c..29c935b0d504 100644
> > > --- a/include/linux/uprobes.h
> > > +++ b/include/linux/uprobes.h
> > > @@ -35,7 +35,7 @@ struct uprobe_consumer {
> > >                               struct pt_regs *regs);
> > >       bool (*filter)(struct uprobe_consumer *self, struct mm_struct *mm);
> > >
> > > -     struct uprobe_consumer *next;
> > > +     struct list_head cons_node;
> > >  };
> > >
> > >  #ifdef CONFIG_UPROBES
> > > diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
> > > index 8bdcdc6901b2..97e58d160647 100644
> > > --- a/kernel/events/uprobes.c
> > > +++ b/kernel/events/uprobes.c
> > > @@ -59,7 +59,7 @@ struct uprobe {
> > >       struct rw_semaphore     register_rwsem;
> > >       struct rw_semaphore     consumer_rwsem;
> > >       struct list_head        pending_list;
> > > -     struct uprobe_consumer  *consumers;
> > > +     struct list_head        consumers;
> > >       struct inode            *inode;         /* Also hold a ref to inode */
> > >       struct rcu_head         rcu;
> > >       loff_t                  offset;
> > > @@ -783,6 +783,7 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset,
> > >       uprobe->inode = inode;
> > >       uprobe->offset = offset;
> > >       uprobe->ref_ctr_offset = ref_ctr_offset;
> > > +     INIT_LIST_HEAD(&uprobe->consumers);
> > >       init_rwsem(&uprobe->register_rwsem);
> > >       init_rwsem(&uprobe->consumer_rwsem);
> > >       RB_CLEAR_NODE(&uprobe->rb_node);
> > > @@ -808,32 +809,19 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset,
> > >  static void consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc)
> > >  {
> > >       down_write(&uprobe->consumer_rwsem);
> > > -     uc->next = uprobe->consumers;
> > > -     uprobe->consumers = uc;
> > > +     list_add_rcu(&uc->cons_node, &uprobe->consumers);
> > >       up_write(&uprobe->consumer_rwsem);
> > >  }
> > >
> > >  /*
> > >   * For uprobe @uprobe, delete the consumer @uc.
> > > - * Return true if the @uc is deleted successfully
> > > - * or return false.
> > > + * Should never be called with consumer that's not part of @uprobe->consumers.
> > >   */
> > > -static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc)
> > > +static void consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc)
> > >  {
> > > -     struct uprobe_consumer **con;
> > > -     bool ret = false;
> > > -
> > >       down_write(&uprobe->consumer_rwsem);
> > > -     for (con = &uprobe->consumers; *con; con = &(*con)->next) {
> > > -             if (*con == uc) {
> > > -                     *con = uc->next;
> > > -                     ret = true;
> > > -                     break;
> > > -             }
> > > -     }
> > > +     list_del_rcu(&uc->cons_node);
> > >       up_write(&uprobe->consumer_rwsem);
> > > -
> > > -     return ret;
> > >  }
> > >
> > >  static int __copy_insn(struct address_space *mapping, struct file *filp,
> > > @@ -929,7 +917,8 @@ static bool filter_chain(struct uprobe *uprobe, struct mm_struct *mm)
> > >       bool ret = false;
> > >
> > >       down_read(&uprobe->consumer_rwsem);
> > > -     for (uc = uprobe->consumers; uc; uc = uc->next) {
> > > +     list_for_each_entry_srcu(uc, &uprobe->consumers, cons_node,
> > > +                              srcu_read_lock_held(&uprobes_srcu)) {
> > >               ret = consumer_filter(uc, mm);
> > >               if (ret)
> > >                       break;
> > > @@ -1125,18 +1114,29 @@ void uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc)
> > >       int err;
> > >
> > >       down_write(&uprobe->register_rwsem);
> > > -     if (WARN_ON(!consumer_del(uprobe, uc))) {
> > > -             err = -ENOENT;
> > > -     } else {
> > > -             err = register_for_each_vma(uprobe, NULL);
> > > -             /* TODO : cant unregister? schedule a worker thread */
> > > -             if (unlikely(err))
> > > -                     uprobe_warn(current, "unregister, leaking uprobe");
> > > -     }
> > > +     consumer_del(uprobe, uc);
> > > +     err = register_for_each_vma(uprobe, NULL);
> > >       up_write(&uprobe->register_rwsem);
> > >
> > > -     if (!err)
> > > -             put_uprobe(uprobe);
> > > +     /* TODO : cant unregister? schedule a worker thread */
> > > +     if (unlikely(err)) {
> > > +             uprobe_warn(current, "unregister, leaking uprobe");
> > > +             goto out_sync;
> > > +     }
> > > +
> > > +     put_uprobe(uprobe);
> > > +
> > > +out_sync:
> > > +     /*
> > > +      * Now that handler_chain() and handle_uretprobe_chain() iterate over
> > > +      * uprobe->consumers list under RCU protection without holding
> > > +      * uprobe->register_rwsem, we need to wait for RCU grace period to
> > > +      * make sure that we can't call into just unregistered
> > > +      * uprobe_consumer's callbacks anymore. If we don't do that, fast and
> > > +      * unlucky enough caller can free consumer's memory and cause
> > > +      * handler_chain() or handle_uretprobe_chain() to do an use-after-free.
> > > +      */
> > > +     synchronize_srcu(&uprobes_srcu);
> > >  }
> > >  EXPORT_SYMBOL_GPL(uprobe_unregister);
> > >
> > > @@ -1214,13 +1214,20 @@ EXPORT_SYMBOL_GPL(uprobe_register);
> > >  int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool add)
> > >  {
> > >       struct uprobe_consumer *con;
> > > -     int ret = -ENOENT;
> > > +     int ret = -ENOENT, srcu_idx;
> > >
> > >       down_write(&uprobe->register_rwsem);
> > > -     for (con = uprobe->consumers; con && con != uc ; con = con->next)
> > > -             ;
> > > -     if (con)
> > > -             ret = register_for_each_vma(uprobe, add ? uc : NULL);
> > > +
> > > +     srcu_idx = srcu_read_lock(&uprobes_srcu);
> > > +     list_for_each_entry_srcu(con, &uprobe->consumers, cons_node,
> > > +                              srcu_read_lock_held(&uprobes_srcu)) {
> > > +             if (con == uc) {
> > > +                     ret = register_for_each_vma(uprobe, add ? uc : NULL);
> > > +                     break;
> > > +             }
> > > +     }
> > > +     srcu_read_unlock(&uprobes_srcu, srcu_idx);
> > > +
> > >       up_write(&uprobe->register_rwsem);
> > >
> > >       return ret;
> > > @@ -2085,10 +2092,12 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
> > >       struct uprobe_consumer *uc;
> > >       int remove = UPROBE_HANDLER_REMOVE;
> > >       bool need_prep = false; /* prepare return uprobe, when needed */
> > > +     bool has_consumers = false;
> > >
> > > -     down_read(&uprobe->register_rwsem);
> > >       current->utask->auprobe = &uprobe->arch;
> > > -     for (uc = uprobe->consumers; uc; uc = uc->next) {
> > > +
> > > +     list_for_each_entry_srcu(uc, &uprobe->consumers, cons_node,
> > > +                              srcu_read_lock_held(&uprobes_srcu)) {
> > >               int rc = 0;
> > >
> > >               if (uc->handler) {
> > > @@ -2101,17 +2110,24 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
> > >                       need_prep = true;
> > >
> > >               remove &= rc;
> > > +             has_consumers = true;
> > >       }
> > >       current->utask->auprobe = NULL;
> > >
> > >       if (need_prep && !remove)
> > >               prepare_uretprobe(uprobe, regs); /* put bp at return */
> > >
> > > -     if (remove && uprobe->consumers) {
> > > -             WARN_ON(!uprobe_is_active(uprobe));
> > > -             unapply_uprobe(uprobe, current->mm);
> > > +     if (remove && has_consumers) {
> > > +             down_read(&uprobe->register_rwsem);
> > > +
> > > +             /* re-check that removal is still required, this time under lock */
> > > +             if (!filter_chain(uprobe, current->mm)) {
> >
> > sorry for late question, but I do not follow this change..
> >
> > at this point we got 1 as handler's return value from all the uprobe's consumers,
> > why do we need to call filter_chain in here.. IIUC this will likely skip over
> > the removal?
> >
> 
> Because we don't hold register_rwsem we are now racing with
> registration. So while we can get all consumers at the time we were
> iterating over the consumer list to request deletion, a parallel CPU
> can add another consumer that needs this uprobe+PID combination. So if
> we don't double-check, we are risking having a consumer that will not
> be triggered for the desired process.
> 
> Does it make sense? Given removal is rare, it's ok to take lock if we
> *suspect* removal, and then check authoritatively again under lock.

with this change the probe will not get removed in the attached test,
it'll get 2 hits, without this change just 1 hit

but I'm not sure it's a big problem, because seems like that's not the
intended way the removal should be used anyway, as explained by Oleg [1]

jirka


[1] https://lore.kernel.org/linux-trace-kernel/ZtHKTtn7sqaLeVxV@krava/T/#m07cdc37307cfd06f17f5755a067c9b300a19ee78

---
diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c
index bf6ca8e3eb13..86d37a8e6169 100644
--- a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c
+++ b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c
@@ -6,6 +6,7 @@
 #include "uprobe_multi.skel.h"
 #include "uprobe_multi_bench.skel.h"
 #include "uprobe_multi_usdt.skel.h"
+#include "uprobe_multi_removal.skel.h"
 #include "bpf/libbpf_internal.h"
 #include "testing_helpers.h"
 #include "../sdt.h"
@@ -687,6 +688,28 @@ static void test_bench_attach_usdt(void)
 	printf("%s: detached in %7.3lfs\n", __func__, detach_delta);
 }
 
+static void test_removal(void)
+{
+	struct uprobe_multi_removal *skel = NULL;
+	int err;
+
+	skel = uprobe_multi_removal__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "uprobe_multi_removal__open_and_load"))
+		return;
+
+	err = uprobe_multi_removal__attach(skel);
+	if (!ASSERT_OK(err, "uprobe_multi_removal__attach"))
+		goto cleanup;
+
+	uprobe_multi_func_1();
+	uprobe_multi_func_1();
+
+	ASSERT_EQ(skel->bss->test, 1, "test");
+
+cleanup:
+	uprobe_multi_removal__destroy(skel);
+}
+
 void test_uprobe_multi_test(void)
 {
 	if (test__start_subtest("skel_api"))
@@ -703,4 +726,6 @@ void test_uprobe_multi_test(void)
 		test_bench_attach_usdt();
 	if (test__start_subtest("attach_api_fails"))
 		test_attach_api_fails();
+	if (test__start_subtest("removal"))
+		test_removal();
 }
diff --git a/tools/testing/selftests/bpf/progs/uprobe_multi_removal.c b/tools/testing/selftests/bpf/progs/uprobe_multi_removal.c
new file mode 100644
index 000000000000..0a948cc1e05b
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/uprobe_multi_removal.c
@@ -0,0 +1,16 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/usdt.bpf.h>
+
+char _license[] SEC("license") = "GPL";
+
+int test;
+
+SEC("uprobe.multi//proc/self/exe:uprobe_multi_func_1")
+int uprobe(struct pt_regs *ctx)
+{
+	test++;
+	return 1;
+}




[Index of Archives]     [Linux ARM Kernel]     [Linux ARM]     [Linux Omap]     [Fedora ARM]     [IETF Annouce]     [Bugtraq]     [Linux OMAP]     [Linux MIPS]     [eCos]     [Asterisk Internet PBX]     [Linux API]

  Powered by Linux