[PATCH 1/2] objpool: enable inlining objpool_push() and objpool_pop() operations

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



objpool_push() and objpool_pop() are very performance-critical functions
and can be called very frequently in kretprobe triggering path.

As such, it makes sense to allow compiler to inline them completely to
eliminate function calls overhead. Luckily, their logic is quite well
isolated and doesn't have any sprawling dependencies.

This patch moves both objpool_push() and objpool_pop() into
include/linux/objpool.h and marks them as static inline functions,
enabling inlining. To avoid anyone using internal helpers
(objpool_try_get_slot, objpool_try_add_slot), rename them to use leading
underscores.

We used kretprobe microbenchmark from BPF selftests (bench trig-kprobe
and trig-kprobe-multi benchmarks) running no-op BPF kretprobe/kretprobe.multi
programs in a tight loop to evaluate the effect. BPF own overhead in
this case is minimal and it mostly stresses the rest of in-kernel
kretprobe infrastructure overhead. Results are in millions of calls per
second. This is not super scientific, but shows the trend nevertheless.

BEFORE
======
kretprobe      :    9.794 ± 0.086M/s
kretprobe-multi:   10.219 ± 0.032M/s

AFTER
=====
kretprobe      :    9.937 ± 0.174M/s (+1.5%)
kretprobe-multi:   10.440 ± 0.108M/s (+2.2%)

Cc: Matt (Qiang) Wu <wuqiang.matt@xxxxxxxxxxxxx>
Signed-off-by: Andrii Nakryiko <andrii@xxxxxxxxxx>
---
 include/linux/objpool.h | 101 +++++++++++++++++++++++++++++++++++++++-
 lib/objpool.c           | 100 ---------------------------------------
 2 files changed, 99 insertions(+), 102 deletions(-)

diff --git a/include/linux/objpool.h b/include/linux/objpool.h
index 15aff4a17f0c..d8b1f7b91128 100644
--- a/include/linux/objpool.h
+++ b/include/linux/objpool.h
@@ -5,6 +5,10 @@
 
 #include <linux/types.h>
 #include <linux/refcount.h>
+#include <linux/atomic.h>
+#include <linux/cpumask.h>
+#include <linux/irqflags.h>
+#include <linux/smp.h>
 
 /*
  * objpool: ring-array based lockless MPMC queue
@@ -118,13 +122,94 @@ int objpool_init(struct objpool_head *pool, int nr_objs, int object_size,
 		 gfp_t gfp, void *context, objpool_init_obj_cb objinit,
 		 objpool_fini_cb release);
 
+/* try to retrieve object from slot */
+static inline void *__objpool_try_get_slot(struct objpool_head *pool, int cpu)
+{
+	struct objpool_slot *slot = pool->cpu_slots[cpu];
+	/* load head snapshot, other cpus may change it */
+	uint32_t head = smp_load_acquire(&slot->head);
+
+	while (head != READ_ONCE(slot->last)) {
+		void *obj;
+
+		/*
+		 * data visibility of 'last' and 'head' could be out of
+		 * order since memory updating of 'last' and 'head' are
+		 * performed in push() and pop() independently
+		 *
+		 * before any retrieving attempts, pop() must guarantee
+		 * 'last' is behind 'head', that is to say, there must
+		 * be available objects in slot, which could be ensured
+		 * by condition 'last != head && last - head <= nr_objs'
+		 * that is equivalent to 'last - head - 1 < nr_objs' as
+		 * 'last' and 'head' are both unsigned int32
+		 */
+		if (READ_ONCE(slot->last) - head - 1 >= pool->nr_objs) {
+			head = READ_ONCE(slot->head);
+			continue;
+		}
+
+		/* obj must be retrieved before moving forward head */
+		obj = READ_ONCE(slot->entries[head & slot->mask]);
+
+		/* move head forward to mark it's consumption */
+		if (try_cmpxchg_release(&slot->head, &head, head + 1))
+			return obj;
+	}
+
+	return NULL;
+}
+
 /**
  * objpool_pop() - allocate an object from objpool
  * @pool: object pool
  *
  * return value: object ptr or NULL if failed
  */
-void *objpool_pop(struct objpool_head *pool);
+static inline void *objpool_pop(struct objpool_head *pool)
+{
+	void *obj = NULL;
+	unsigned long flags;
+	int i, cpu;
+
+	/* disable local irq to avoid preemption & interruption */
+	raw_local_irq_save(flags);
+
+	cpu = raw_smp_processor_id();
+	for (i = 0; i < num_possible_cpus(); i++) {
+		obj = __objpool_try_get_slot(pool, cpu);
+		if (obj)
+			break;
+		cpu = cpumask_next_wrap(cpu, cpu_possible_mask, -1, 1);
+	}
+	raw_local_irq_restore(flags);
+
+	return obj;
+}
+
+/* adding object to slot, abort if the slot was already full */
+static inline int
+__objpool_try_add_slot(void *obj, struct objpool_head *pool, int cpu)
+{
+	struct objpool_slot *slot = pool->cpu_slots[cpu];
+	uint32_t head, tail;
+
+	/* loading tail and head as a local snapshot, tail first */
+	tail = READ_ONCE(slot->tail);
+
+	do {
+		head = READ_ONCE(slot->head);
+		/* fault caught: something must be wrong */
+		WARN_ON_ONCE(tail - head > pool->nr_objs);
+	} while (!try_cmpxchg_acquire(&slot->tail, &tail, tail + 1));
+
+	/* now the tail position is reserved for the given obj */
+	WRITE_ONCE(slot->entries[tail & slot->mask], obj);
+	/* update sequence to make this obj available for pop() */
+	smp_store_release(&slot->last, tail + 1);
+
+	return 0;
+}
 
 /**
  * objpool_push() - reclaim the object and return back to objpool
@@ -134,7 +219,19 @@ void *objpool_pop(struct objpool_head *pool);
  * return: 0 or error code (it fails only when user tries to push
  * the same object multiple times or wrong "objects" into objpool)
  */
-int objpool_push(void *obj, struct objpool_head *pool);
+static inline int objpool_push(void *obj, struct objpool_head *pool)
+{
+	unsigned long flags;
+	int rc;
+
+	/* disable local irq to avoid preemption & interruption */
+	raw_local_irq_save(flags);
+	rc = __objpool_try_add_slot(obj, pool, raw_smp_processor_id());
+	raw_local_irq_restore(flags);
+
+	return rc;
+}
+
 
 /**
  * objpool_drop() - discard the object and deref objpool
diff --git a/lib/objpool.c b/lib/objpool.c
index cfdc02420884..f696308fc026 100644
--- a/lib/objpool.c
+++ b/lib/objpool.c
@@ -152,106 +152,6 @@ int objpool_init(struct objpool_head *pool, int nr_objs, int object_size,
 }
 EXPORT_SYMBOL_GPL(objpool_init);
 
-/* adding object to slot, abort if the slot was already full */
-static inline int
-objpool_try_add_slot(void *obj, struct objpool_head *pool, int cpu)
-{
-	struct objpool_slot *slot = pool->cpu_slots[cpu];
-	uint32_t head, tail;
-
-	/* loading tail and head as a local snapshot, tail first */
-	tail = READ_ONCE(slot->tail);
-
-	do {
-		head = READ_ONCE(slot->head);
-		/* fault caught: something must be wrong */
-		WARN_ON_ONCE(tail - head > pool->nr_objs);
-	} while (!try_cmpxchg_acquire(&slot->tail, &tail, tail + 1));
-
-	/* now the tail position is reserved for the given obj */
-	WRITE_ONCE(slot->entries[tail & slot->mask], obj);
-	/* update sequence to make this obj available for pop() */
-	smp_store_release(&slot->last, tail + 1);
-
-	return 0;
-}
-
-/* reclaim an object to object pool */
-int objpool_push(void *obj, struct objpool_head *pool)
-{
-	unsigned long flags;
-	int rc;
-
-	/* disable local irq to avoid preemption & interruption */
-	raw_local_irq_save(flags);
-	rc = objpool_try_add_slot(obj, pool, raw_smp_processor_id());
-	raw_local_irq_restore(flags);
-
-	return rc;
-}
-EXPORT_SYMBOL_GPL(objpool_push);
-
-/* try to retrieve object from slot */
-static inline void *objpool_try_get_slot(struct objpool_head *pool, int cpu)
-{
-	struct objpool_slot *slot = pool->cpu_slots[cpu];
-	/* load head snapshot, other cpus may change it */
-	uint32_t head = smp_load_acquire(&slot->head);
-
-	while (head != READ_ONCE(slot->last)) {
-		void *obj;
-
-		/*
-		 * data visibility of 'last' and 'head' could be out of
-		 * order since memory updating of 'last' and 'head' are
-		 * performed in push() and pop() independently
-		 *
-		 * before any retrieving attempts, pop() must guarantee
-		 * 'last' is behind 'head', that is to say, there must
-		 * be available objects in slot, which could be ensured
-		 * by condition 'last != head && last - head <= nr_objs'
-		 * that is equivalent to 'last - head - 1 < nr_objs' as
-		 * 'last' and 'head' are both unsigned int32
-		 */
-		if (READ_ONCE(slot->last) - head - 1 >= pool->nr_objs) {
-			head = READ_ONCE(slot->head);
-			continue;
-		}
-
-		/* obj must be retrieved before moving forward head */
-		obj = READ_ONCE(slot->entries[head & slot->mask]);
-
-		/* move head forward to mark it's consumption */
-		if (try_cmpxchg_release(&slot->head, &head, head + 1))
-			return obj;
-	}
-
-	return NULL;
-}
-
-/* allocate an object from object pool */
-void *objpool_pop(struct objpool_head *pool)
-{
-	void *obj = NULL;
-	unsigned long flags;
-	int i, cpu;
-
-	/* disable local irq to avoid preemption & interruption */
-	raw_local_irq_save(flags);
-
-	cpu = raw_smp_processor_id();
-	for (i = 0; i < num_possible_cpus(); i++) {
-		obj = objpool_try_get_slot(pool, cpu);
-		if (obj)
-			break;
-		cpu = cpumask_next_wrap(cpu, cpu_possible_mask, -1, 1);
-	}
-	raw_local_irq_restore(flags);
-
-	return obj;
-}
-EXPORT_SYMBOL_GPL(objpool_pop);
-
 /* release whole objpool forcely */
 void objpool_free(struct objpool_head *pool)
 {
-- 
2.43.0





[Index of Archives]     [Linux Samsung SoC]     [Linux Rockchip SoC]     [Linux Actions SoC]     [Linux for Synopsys ARC Processors]     [Linux NFS]     [Linux NILFS]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]


  Powered by Linux