+LKML

On Tue, Jul 02, 2024 at 12:23:53PM +0200, Peter Zijlstra wrote:
> On Mon, Jul 01, 2024 at 03:39:23PM -0700, Andrii Nakryiko wrote:
> > This patch set, ultimately, switches global uprobes_treelock from RW spinlock
> > to per-CPU RW semaphore, which has better performance and scales better under
> > contention and multiple parallel threads triggering lots of uprobes.
>
> Why not RCU + normal lock thing?

Something like the *completely* untested below.

---
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 2c83ba776fc7..03b38f3f7be3 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -40,6 +40,7 @@ static struct rb_root uprobes_tree = RB_ROOT;
 #define no_uprobe_events()	RB_EMPTY_ROOT(&uprobes_tree)
 
 static DEFINE_RWLOCK(uprobes_treelock);	/* serialize rbtree access */
+static seqcount_rwlock_t uprobes_seqcount = SEQCNT_RWLOCK_ZERO(uprobes_seqcount, &uprobes_treelock);
 
 #define UPROBES_HASH_SZ	13
 /* serialize uprobe->pending_list */
@@ -54,6 +55,7 @@ DEFINE_STATIC_PERCPU_RWSEM(dup_mmap_sem);
 struct uprobe {
 	struct rb_node		rb_node;	/* node in the rb tree */
 	refcount_t		ref;
+	struct rcu_head		rcu;
 	struct rw_semaphore	register_rwsem;
 	struct rw_semaphore	consumer_rwsem;
 	struct list_head	pending_list;
@@ -67,7 +69,7 @@ struct uprobe {
  * The generic code assumes that it has two members of unknown type
  * owned by the arch-specific code:
  *
- * insn -	copy_insn() saves the original instruction here for
+ * insn -	copy_insn() saves the original instruction here for
  *		arch_uprobe_analyze_insn().
  *
  * ixol -	potentially modified instruction to execute out of
@@ -593,6 +595,12 @@ static struct uprobe *get_uprobe(struct uprobe *uprobe)
 	return uprobe;
 }
 
+static void uprobe_free_rcu(struct rcu_head *rcu)
+{
+	struct uprobe *uprobe = container_of(rcu, struct uprobe, rcu);
+	kfree(uprobe);
+}
+
 static void put_uprobe(struct uprobe *uprobe)
 {
 	if (refcount_dec_and_test(&uprobe->ref)) {
@@ -604,7 +612,8 @@ static void put_uprobe(struct uprobe *uprobe)
 		mutex_lock(&delayed_uprobe_lock);
 		delayed_uprobe_remove(uprobe, NULL);
 		mutex_unlock(&delayed_uprobe_lock);
-		kfree(uprobe);
+
+		call_rcu(&uprobe->rcu, uprobe_free_rcu);
 	}
 }
 
@@ -668,12 +677,25 @@ static struct uprobe *__find_uprobe(struct inode *inode, loff_t offset)
 static struct uprobe *find_uprobe(struct inode *inode, loff_t offset)
 {
 	struct uprobe *uprobe;
+	unsigned seq;
 
-	read_lock(&uprobes_treelock);
-	uprobe = __find_uprobe(inode, offset);
-	read_unlock(&uprobes_treelock);
+	guard(rcu)();
 
-	return uprobe;
+	do {
+		seq = read_seqcount_begin(&uprobes_seqcount);
+		uprobe = __find_uprobe(inode, offset);
+		if (uprobe) {
+			/*
+			 * Lockless RB-tree lookups are prone to false-negatives.
+			 * If they find something, it's good. If they do not find,
+			 * it needs to be validated.
+			 */
+			return uprobe;
+		}
+	} while (read_seqcount_retry(&uprobes_seqcount, seq));
+
+	/* Really didn't find anything. */
+	return NULL;
 }
 
 static struct uprobe *__insert_uprobe(struct uprobe *uprobe)
@@ -702,7 +724,9 @@ static struct uprobe *insert_uprobe(struct uprobe *uprobe)
 	struct uprobe *u;
 
 	write_lock(&uprobes_treelock);
+	write_seqcount_begin(&uprobes_seqcount);
 	u = __insert_uprobe(uprobe);
+	write_seqcount_end(&uprobes_seqcount);
 	write_unlock(&uprobes_treelock);
 
 	return u;
@@ -936,7 +960,9 @@ static void delete_uprobe(struct uprobe *uprobe)
 		return;
 
 	write_lock(&uprobes_treelock);
+	write_seqcount_begin(&uprobes_seqcount);
 	rb_erase(&uprobe->rb_node, &uprobes_tree);
+	write_seqcount_end(&uprobes_seqcount);
 	write_unlock(&uprobes_treelock);
 	RB_CLEAR_NODE(&uprobe->rb_node); /* for uprobe_is_active() */
 	put_uprobe(uprobe);
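
For readers less familiar with the seqcount idiom used in find_uprobe() above, the shape is: a lockless rb-tree walk can produce false negatives while a writer is rebalancing, so a hit can be trusted immediately, but a miss only counts if the sequence counter shows no writer ran in the meantime; otherwise the lookup is retried. Below is a minimal, self-contained user-space sketch of that retry logic. It uses C11 atomics, a pthread mutex and a flat array rather than the kernel's seqcount_rwlock_t / RCU machinery and the uprobes rb-tree; the names (table_seq, table_store, table_find) are made up for illustration, not kernel API.

/*
 * Illustration only: user-space analogue of the seqcount retry pattern.
 * The array (0 == empty slot) stands in for the uprobes rb-tree; in the
 * kernel the whole point is that the protected structure is a tree whose
 * lockless walks can transiently miss nodes during rotations.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;
static atomic_uint table_seq;		/* even: stable, odd: update in progress */
static _Atomic int table[16];

/* Writer: serialize on the lock, bump the sequence around the update. */
static void table_store(int idx, int val)
{
	pthread_mutex_lock(&table_lock);
	atomic_fetch_add(&table_seq, 1);	/* begin: sequence goes odd */
	atomic_store(&table[idx], val);
	atomic_fetch_add(&table_seq, 1);	/* end: sequence goes even again */
	pthread_mutex_unlock(&table_lock);
}

/*
 * Lockless reader: a hit is returned immediately; a miss is only trusted
 * if no writer ran while we were looking, otherwise retry.
 */
static bool table_find(int idx, int *val)
{
	unsigned int seq;

	for (;;) {
		do {
			seq = atomic_load(&table_seq);
		} while (seq & 1);		/* wait out an in-progress writer */

		*val = atomic_load(&table[idx]);
		if (*val != 0)
			return true;		/* found: good even if racy */

		if (atomic_load(&table_seq) == seq)
			return false;		/* validated miss */
	}
}

int main(void)
{
	int v;

	table_store(3, 42);
	if (table_find(3, &v))
		printf("slot 3 -> %d\n", v);
	if (!table_find(5, &v))
		printf("slot 5: validated miss\n");
	return 0;
}

The diff above follows the same shape: __find_uprobe() under guard(rcu)() plays the role of the array lookup, and the writers bump uprobes_seqcount around __insert_uprobe() and rb_erase() while still holding uprobes_treelock, with call_rcu() keeping a found uprobe from being freed under the lockless reader.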