Re: [PATCH v2 00/12] uprobes: add batched register/unregister APIs and per-CPU RW semaphore

Andrii Nakryiko <andrii.nakryiko@xxxxxxxxx> · Tue, 2 Jul 2024 10:54:51 -0700

On Tue, Jul 2, 2024 at 4:54 AM Peter Zijlstra <peterz@xxxxxxxxxxxxx> wrote:
>
>
> +LKML
>
> On Tue, Jul 02, 2024 at 12:23:53PM +0200, Peter Zijlstra wrote:
> > On Mon, Jul 01, 2024 at 03:39:23PM -0700, Andrii Nakryiko wrote:
> > > This patch set, ultimately, switches global uprobes_treelock from RW spinlock
> > > to per-CPU RW semaphore, which has better performance and scales better under
> > > contention and multiple parallel threads triggering lots of uprobes.
> >
> > Why not RCU + normal lock thing?
>
> Something like the *completely* untested below.
>
> ---
> diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
> index 2c83ba776fc7..03b38f3f7be3 100644
> --- a/kernel/events/uprobes.c
> +++ b/kernel/events/uprobes.c
> @@ -40,6 +40,7 @@ static struct rb_root uprobes_tree = RB_ROOT;
>  #define no_uprobe_events()     RB_EMPTY_ROOT(&uprobes_tree)
>
>  static DEFINE_RWLOCK(uprobes_treelock);        /* serialize rbtree access */
> +static seqcount_rwlock_t uprobes_seqcount = SEQCNT_RWLOCK_ZERO(uprobes_seqcount, &uprobes_treelock);
>
>  #define UPROBES_HASH_SZ        13
>  /* serialize uprobe->pending_list */
> @@ -54,6 +55,7 @@ DEFINE_STATIC_PERCPU_RWSEM(dup_mmap_sem);
>  struct uprobe {
>         struct rb_node          rb_node;        /* node in the rb tree */
>         refcount_t              ref;
> +       struct rcu_head         rcu;
>         struct rw_semaphore     register_rwsem;
>         struct rw_semaphore     consumer_rwsem;
>         struct list_head        pending_list;
> @@ -67,7 +69,7 @@ struct uprobe {
>          * The generic code assumes that it has two members of unknown type
>          * owned by the arch-specific code:
>          *
> -        *      insn -  copy_insn() saves the original instruction here for
> +        *      insn -  copy_insn() saves the original instruction here for
>          *              arch_uprobe_analyze_insn().
>          *
>          *      ixol -  potentially modified instruction to execute out of
> @@ -593,6 +595,12 @@ static struct uprobe *get_uprobe(struct uprobe *uprobe)
>         return uprobe;
>  }
>
> +static void uprobe_free_rcu(struct rcu_head *rcu)
> +{
> +       struct uprobe *uprobe = container_of(rcu, struct uprobe, rcu);
> +       kfree(uprobe);
> +}
> +
>  static void put_uprobe(struct uprobe *uprobe)
>  {
>         if (refcount_dec_and_test(&uprobe->ref)) {
> @@ -604,7 +612,8 @@ static void put_uprobe(struct uprobe *uprobe)

right above this we have roughly this:

percpu_down_write(&uprobes_treelock);

/* refcount check */
rb_erase(&uprobe->rb_node, &uprobes_tree);

percpu_up_write(&uprobes_treelock);

This writer lock is necessary for modification of the RB tree. And I
was under impression that I shouldn't be doing
percpu_(down|up)_write() inside the normal
rcu_read_lock()/rcu_read_unlock() region (percpu_down_write has
might_sleep() in it). But maybe I'm wrong, hopefully Paul can help to
clarify.

But actually what's wrong with RCU Tasks Trace flavor? I will
ultimately use it anyway to avoid uprobe taking unnecessary refcount
and to protect uprobe->consumers iteration and uc->handler() calls,
which could be sleepable, so would need rcu_read_lock_trace().

>                 mutex_lock(&delayed_uprobe_lock);
>                 delayed_uprobe_remove(uprobe, NULL);
>                 mutex_unlock(&delayed_uprobe_lock);
> -               kfree(uprobe);
> +
> +               call_rcu(&uprobe->rcu, uprobe_free_rcu);
>         }
>  }
>
> @@ -668,12 +677,25 @@ static struct uprobe *__find_uprobe(struct inode *inode, loff_t offset)
>  static struct uprobe *find_uprobe(struct inode *inode, loff_t offset)
>  {
>         struct uprobe *uprobe;
> +       unsigned seq;
>
> -       read_lock(&uprobes_treelock);
> -       uprobe = __find_uprobe(inode, offset);
> -       read_unlock(&uprobes_treelock);
> +       guard(rcu)();
>
> -       return uprobe;
> +       do {
> +               seq = read_seqcount_begin(&uprobes_seqcount);
> +               uprobes = __find_uprobe(inode, offset);
> +               if (uprobes) {
> +                       /*
> +                        * Lockless RB-tree lookups are prone to false-negatives.
> +                        * If they find something, it's good. If they do not find,
> +                        * it needs to be validated.
> +                        */
> +                       return uprobes;
> +               }
> +       } while (read_seqcount_retry(&uprobes_seqcount, seq));
> +
> +       /* Really didn't find anything. */
> +       return NULL;
>  }

Honest question here, as I don't understand the tradeoffs well enough.
Is there a lot of benefit to switching to seqcount lock vs using
percpu RW semaphore (previously recommended by Ingo). The latter is a
nice drop-in replacement and seems to be very fast and scale well.
Right now we are bottlenecked on uprobe->register_rwsem (not
uprobes_treelock anymore), which is currently limiting the scalability
of uprobes and I'm going to work on that next once I'm done with this
series.

>
>  static struct uprobe *__insert_uprobe(struct uprobe *uprobe)
> @@ -702,7 +724,9 @@ static struct uprobe *insert_uprobe(struct uprobe *uprobe)
>         struct uprobe *u;
>
>         write_lock(&uprobes_treelock);
> +       write_seqcount_begin(&uprobes_seqcount);
>         u = __insert_uprobe(uprobe);
> +       write_seqcount_end(&uprobes_seqcount);
>         write_unlock(&uprobes_treelock);
>
>         return u;
> @@ -936,7 +960,9 @@ static void delete_uprobe(struct uprobe *uprobe)
>                 return;
>
>         write_lock(&uprobes_treelock);
> +       write_seqcount_begin(&uprobes_seqcount);
>         rb_erase(&uprobe->rb_node, &uprobes_tree);
> +       write_seqcount_end(&uprobes_seqcount);
>         write_unlock(&uprobes_treelock);
>         RB_CLEAR_NODE(&uprobe->rb_node); /* for uprobe_is_active() */
>         put_uprobe(uprobe);