Keep track of the time the thread at the head of the secondary queue
has been waiting, and force inter-node handoff once this time passes
a preset threshold. The default value for the threshold (1ms) can be
overridden with the new kernel boot command-line option
"qspinlock.numa_spinlock_threshold_ns".

Signed-off-by: Alex Kogan <alex.kogan@xxxxxxxxxx>
Reviewed-by: Steve Sistare <steven.sistare@xxxxxxxxxx>
Reviewed-by: Waiman Long <longman@xxxxxxxxxx>
---
 .../admin-guide/kernel-parameters.txt |  8 ++
 kernel/locking/qspinlock_cna.h        | 81 +++++++++++++++----
 2 files changed, 73 insertions(+), 16 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 94d35507560c..e2bcf6f4e048 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -4183,6 +4183,14 @@
 			[KNL] Number of legacy pty's. Overwrites compiled-in
 			default number.
 
+	qspinlock.numa_spinlock_threshold_ns=	[NUMA, PV_OPS]
+			Set the time threshold in nanoseconds for how long
+			a waiter at the head of the secondary queue may spin
+			before the NUMA-aware spinlock is forced to be passed
+			to a thread on another NUMA node. Smaller values
+			result in a more fair, but less performant spinlock,
+			and vice versa. The default value is 1000000 (=1ms).
+
 	quiet		[KNL] Disable most log messages
 
 	r128=		[HW,DRM]

diff --git a/kernel/locking/qspinlock_cna.h b/kernel/locking/qspinlock_cna.h
index ca564e64e5de..0b991c340fb1 100644
--- a/kernel/locking/qspinlock_cna.h
+++ b/kernel/locking/qspinlock_cna.h
@@ -4,6 +4,8 @@
 #endif
 
 #include <linux/topology.h>
+#include <linux/sched/clock.h>
+#include <linux/moduleparam.h>
 
 /*
  * Implement a NUMA-aware version of MCS (aka CNA, or compact NUMA-aware lock).
@@ -37,19 +39,39 @@
  * gradually filter the primary queue, leaving only waiters running on the same
  * preferred NUMA node.
  *
+ * We change the NUMA node preference after a waiter at the head of the
+ * secondary queue spins for a certain amount of time (1ms, by default).
+ * We do that by flushing the secondary queue into the head of the primary
+ * queue, effectively changing the preference to the NUMA node of the
+ * waiter at the head of the secondary queue at the time of the flush.
+ *
  * For more details, see https://arxiv.org/abs/1810.05600.
 *
 * Authors: Alex Kogan <alex.kogan@xxxxxxxxxx>
 *          Dave Dice <dave.dice@xxxxxxxxxx>
 */
 
+#define FLUSH_SECONDARY_QUEUE	1
+
 struct cna_node {
 	struct mcs_spinlock	mcs;
 	u16			numa_node;
 	u16			real_numa_node;
 	u32			encoded_tail;	/* self */
+	u64			start_time;
 };
 
+static ulong numa_spinlock_threshold_ns = 1000000;	/* 1ms, by default */
+module_param(numa_spinlock_threshold_ns, ulong, 0644);
+
+static inline bool intra_node_threshold_reached(struct cna_node *cn)
+{
+	u64 current_time = local_clock();
+	u64 threshold = cn->start_time + numa_spinlock_threshold_ns;
+
+	return current_time > threshold;
+}
+
 static void __init cna_init_nodes_per_cpu(unsigned int cpu)
 {
 	struct mcs_spinlock *base = per_cpu_ptr(&qnodes[0].mcs, cpu);
@@ -92,6 +114,7 @@ static __always_inline void cna_init_node(struct mcs_spinlock *node)
 	struct cna_node *cn = (struct cna_node *)node;
 
 	cn->numa_node = cn->real_numa_node;
+	cn->start_time = 0;
 }
 
 /*
@@ -191,8 +214,14 @@ static void cna_splice_next(struct mcs_spinlock *node,
 
 	/* stick `next` on the secondary queue tail */
 	if (node->locked <= 1) { /* if secondary queue is empty */
+		struct cna_node *cn = (struct cna_node *)node;
+
 		/* create secondary queue */
 		next->next = next;
+
+		cn->start_time = local_clock();
+		/* secondary queue is not empty iff start_time != 0 */
+		WARN_ON(!cn->start_time);
 	} else {
 		/* add to the tail of the secondary queue */
 		struct mcs_spinlock *tail_2nd = decode_tail(node->locked);
@@ -240,12 +269,18 @@ static int cna_order_queue(struct mcs_spinlock *node)
 static __always_inline u32 cna_wait_head_or_lock(struct qspinlock *lock,
						 struct mcs_spinlock *node)
 {
-	/*
-	 * Try and put the time otherwise spent spin waiting on
-	 * _Q_LOCKED_PENDING_MASK to use by sorting our lists.
-	 */
-	while (LOCK_IS_BUSY(lock) && !cna_order_queue(node))
-		cpu_relax();
+	struct cna_node *cn = (struct cna_node *)node;
+
+	if (!cn->start_time || !intra_node_threshold_reached(cn)) {
+		/*
+		 * Try and put the time otherwise spent spin waiting on
+		 * _Q_LOCKED_PENDING_MASK to use by sorting our lists.
+		 */
+		while (LOCK_IS_BUSY(lock) && !cna_order_queue(node))
+			cpu_relax();
+	} else {
+		cn->start_time = FLUSH_SECONDARY_QUEUE;
+	}
 
 	return 0; /* we lied; we didn't wait, go do so now */
 }
@@ -253,24 +288,38 @@ static __always_inline u32 cna_wait_head_or_lock(struct qspinlock *lock,
 static inline void cna_lock_handoff(struct mcs_spinlock *node,
				 struct mcs_spinlock *next)
 {
+	struct cna_node *cn = (struct cna_node *)node;
 	u32 val = 1;
 
-	if (node->locked > 1) {
-		struct cna_node *cn = (struct cna_node *)node;
+	if (cn->start_time != FLUSH_SECONDARY_QUEUE) {
+		if (node->locked > 1) {
+			val = node->locked;	/* preserve secondary queue */
+
+			/*
+			 * We have a local waiter, either real or fake one;
+			 * reload @next in case it was changed by cna_order_queue().
+			 */
+			next = node->next;
 
-		val = node->locked;	/* preseve secondary queue */
+			/*
+			 * Pass over NUMA node id of primary queue, to maintain the
+			 * preference even if the next waiter is on a different node.
+			 */
+			((struct cna_node *)next)->numa_node = cn->numa_node;
+			((struct cna_node *)next)->start_time = cn->start_time;
+		}
+	} else {
 		/*
-		 * We have a local waiter, either real or fake one;
-		 * reload @next in case it was changed by cna_order_queue().
+		 * We decided to flush the secondary queue;
+		 * this can only happen if that queue is not empty.
 		 */
-		next = node->next;
-
+		WARN_ON(node->locked <= 1);
 		/*
-		 * Pass over NUMA node id of primary queue, to maintain the
-		 * preference even if the next waiter is on a different node.
+		 * Splice the secondary queue onto the primary queue and pass
+		 * the lock to the longest waiting remote waiter.
 		 */
-		((struct cna_node *)next)->numa_node = cn->numa_node;
+		next = cna_splice_head(NULL, 0, node, next);
 	}
 
 	arch_mcs_lock_handoff(&next->locked, val);
-- 
2.24.3 (Apple Git-128)
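
For illustration, below is a rough, self-contained user-space sketch of
the starvation-avoidance logic this patch adds. It is not part of the
patch: the helpers now_ns() and threshold_reached() are stand-ins for
local_clock() and intra_node_threshold_reached(), and the MCS queues
themselves are elided.

	#include <stdbool.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <time.h>

	#define FLUSH_SECONDARY_QUEUE	1

	static uint64_t threshold_ns = 1000000;	/* 1ms, matching the default */

	/* stand-in for local_clock() */
	static uint64_t now_ns(void)
	{
		struct timespec ts;

		clock_gettime(CLOCK_MONOTONIC, &ts);
		return (uint64_t)ts.tv_sec * 1000000000ull + ts.tv_nsec;
	}

	/* mirrors intra_node_threshold_reached() */
	static bool threshold_reached(uint64_t start_time)
	{
		return now_ns() > start_time + threshold_ns;
	}

	int main(void)
	{
		/*
		 * In the patch, start_time is recorded in cna_splice_next()
		 * when the secondary queue first becomes non-empty, and is
		 * reset to 0 in cna_init_node().
		 */
		uint64_t start_time = now_ns();

		/*
		 * Stand-in for the loop in cna_wait_head_or_lock(): keep
		 * preferring same-node waiters until the threshold passes.
		 */
		while (!threshold_reached(start_time))
			;

		/*
		 * Past the threshold, cna_wait_head_or_lock() tags start_time
		 * with FLUSH_SECONDARY_QUEUE, so cna_lock_handoff() splices
		 * the secondary queue back onto the primary one and hands the
		 * lock to the longest waiting remote waiter.
		 */
		start_time = FLUSH_SECONDARY_QUEUE;
		printf("threshold reached after ~%lu ns: flush secondary queue\n",
		       (unsigned long)threshold_ns);
		return 0;
	}

Note that start_time doubles as a flag in the patch: 0 means the
secondary queue is empty, FLUSH_SECONDARY_QUEUE (1) means a flush was
requested, and any other value is the local_clock() timestamp taken when
the secondary queue became non-empty; hence the WARN_ON() in
cna_splice_next() against a zero clock reading.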
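
Also, since the parameter is declared with module_param(..., 0644), it
should be tunable at run time via sysfs in addition to the documented
boot-time option. Assuming the usual layout for built-in module
parameters (the sysfs path below is inferred from the declaration, not
taken from the patch):

	# on the kernel command line, at boot
	qspinlock.numa_spinlock_threshold_ns=10000000

	# at run time
	echo 10000000 > /sys/module/qspinlock/parameters/numa_spinlock_threshold_ns

Either would raise the threshold from the default 1ms to 10ms, trading
fairness for throughput as described in the documentation above.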