This is a development of "percpu_ida" library. The major differrences between "percpu_ida" and "percpu_tags" are: * The global freelist has gone. As result, CPUs do not compete for the global lock. * Long-running operatons (scanning thru a cpumask) are executed with local interrupts enabled; * percpu_ida::percpu_max_size limit is eliminated. Instead, the limit is determined dynamically. Depending from how many CPUs are requesting tags each CPU gets a fair share of the tag space; * A tag is attempted to return to the CPU it was allocated on. As result, it is expected the locality of data associated with the tag benefits; Signed-off-by: Alexander Gordeev <agordeev@xxxxxxxxxx> Cc: linux-scsi@xxxxxxxxxxxxxxx Cc: qla2xxx-upstream@xxxxxxxxxx Cc: Nicholas Bellinger <nab@xxxxxxxxxxxxx> Cc: Kent Overstreet <kmo@xxxxxxxxxxxxx> Cc: "Michael S. Tsirkin" <mst@xxxxxxxxxx> --- include/linux/percpu_tags.h | 37 +++ lib/Makefile | 2 +- lib/percpu_tags.c | 556 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 594 insertions(+), 1 deletions(-) create mode 100644 include/linux/percpu_tags.h create mode 100644 lib/percpu_tags.c diff --git a/include/linux/percpu_tags.h b/include/linux/percpu_tags.h new file mode 100644 index 0000000..ee89863 --- /dev/null +++ b/include/linux/percpu_tags.h @@ -0,0 +1,37 @@ +#ifndef __PERCPU_TAGS_H__ +#define __PERCPU_TAGS_H__ + +#include <linux/types.h> +#include <linux/bitops.h> +#include <linux/init.h> +#include <linux/sched.h> +#include <linux/spinlock_types.h> +#include <linux/wait.h> +#include <linux/cpumask.h> + +struct percpu_cache; + +struct percpu_tags { + int nr_tags; + struct percpu_cache __percpu *cache; + + int *tag_cpu_map; + + cpumask_t alloc_tags; + cpumask_t free_tags; + cpumask_t wait_tags; +}; + +int percpu_tags_alloc(struct percpu_tags *pt, int state); +void percpu_tags_free(struct percpu_tags *pt, int tag); + +int percpu_tags_init(struct percpu_tags *pt, int nr_tags); +void percpu_tags_destroy(struct percpu_tags *pt); + +int percpu_tags_for_each_free(struct percpu_tags *pt, + int (*fn)(unsigned, void *), void *data); +int percpu_tags_free_tags(struct percpu_tags *pt, int cpu); + +#define PERCPU_TAGS_BATCH_MAX 4 + +#endif /* __PERCPU_TAGS_H__ */ diff --git a/lib/Makefile b/lib/Makefile index ba967a1..671143f 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -26,7 +26,7 @@ obj-y += bcd.o div64.o sort.o parser.o halfmd4.o debug_locks.o random32.o \ bust_spinlocks.o hexdump.o kasprintf.o bitmap.o scatterlist.o \ gcd.o lcm.o list_sort.o uuid.o flex_array.o iovec.o clz_ctz.o \ bsearch.o find_last_bit.o find_next_bit.o llist.o memweight.o kfifo.o \ - percpu-refcount.o percpu_ida.o hash.o + percpu-refcount.o percpu_ida.o percpu_tags.o hash.o obj-y += string_helpers.o obj-$(CONFIG_TEST_STRING_HELPERS) += test-string_helpers.o obj-y += kstrtox.o diff --git a/lib/percpu_tags.c b/lib/percpu_tags.c new file mode 100644 index 0000000..7bb61f5 --- /dev/null +++ b/lib/percpu_tags.c @@ -0,0 +1,556 @@ +/* + * Percpu tags library, based on percpu IDA library + * + * Copyright (C) 2014 RedHat, Inc. Alexander Gordeev + * Copyright (C) 2013 Datera, Inc. Kent Overstreet + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2, or (at + * your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#include <linux/bitmap.h> +#include <linux/bitops.h> +#include <linux/bug.h> +#include <linux/err.h> +#include <linux/export.h> +#include <linux/hardirq.h> +#include <linux/idr.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/percpu.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/spinlock.h> +#include <linux/percpu_tags.h> + +struct percpu_cache { + spinlock_t lock; + int cpu; /* CPU this cache belongs to */ + wait_queue_head_t wait; /* tasks waiting for a tag */ + + int nr_alloc; /* nr of allocated tags */ + + int nr_free; /* nr of unallocated tags */ + int freelist[]; +}; + +#define spin_lock_irqsave_cond(lock, cpu, flags) \ +do { \ + preempt_disable(); \ + if ((cpu) == smp_processor_id()) \ + local_irq_save(flags); \ + spin_lock(lock); \ + preempt_enable(); \ +} while (0) + +#define spin_unlock_irqrestore_cond(lock, cpu, flags) \ +do { \ + int __this_cpu = smp_processor_id(); \ + spin_unlock(lock); \ + if (cpu == __this_cpu) \ + local_irq_restore(flags); \ +} while (0) + +#define double_spin_lock_irqsave_cond(lock1, cpu1, lock2, cpu2, flags) \ +do { \ + spinlock_t *__lock1 = (lock1); \ + spinlock_t *__lock2 = (lock2); \ + int __this_cpu; \ + \ + if (__lock1 > __lock2) \ + swap(__lock1, __lock2); \ + \ + preempt_disable(); \ + __this_cpu = smp_processor_id(); \ + if ((cpu1) == __this_cpu || (cpu2) == __this_cpu) \ + local_irq_save(flags); \ + spin_lock(__lock1); \ + spin_lock_nested(__lock2, SINGLE_DEPTH_NESTING); \ + preempt_enable(); \ +} while (0) + +static inline void cpu_to_tag(struct percpu_tags *pt, int cpu, int tag) +{ + smp_rmb(); + if (pt->tag_cpu_map[tag] != cpu) { + pt->tag_cpu_map[tag] = cpu; + smp_wmb(); + } +} + +static inline int cpu_from_tag(struct percpu_tags *pt, int tag) +{ + smp_rmb(); + return pt->tag_cpu_map[tag]; +} + +static void move_tags(int *dst, int *nr_dst, int *src, int *nr_src, size_t nr) +{ + *nr_src -= nr; + memcpy(dst + *nr_dst, src + *nr_src, sizeof(*dst) * nr); + *nr_dst += nr; +} + +static int batch_size(int nr_cache) +{ + return max(1, min(PERCPU_TAGS_BATCH_MAX, nr_cache / 4)); +} + +static int cache_size(struct percpu_tags *pt) +{ + int weight; + + /* + * Bitmask percpu_tags::alloc_tags is used to indicate the number + * of currently active CPUs, although it is unlikely reflects a + * CPU activity reliably - we need a better heuristic here. + */ + smp_rmb(); + weight = cpumask_weight(&pt->alloc_tags); + + if (weight) + return pt->nr_tags / weight; + else + return pt->nr_tags; +} + +static int alloc_tag_local(struct percpu_tags *pt) +{ + bool scarce = pt->nr_tags < cpumask_weight(cpu_online_mask); + int nr_cache = cache_size(pt); + struct percpu_cache *tags = this_cpu_ptr(pt->cache); + int cpu = tags->cpu; + unsigned long uninitialized_var(flags); + int tag; + + spin_lock_irqsave_cond(&tags->lock, cpu, flags); + + if (!tags->nr_free || + (!scarce && (tags->nr_free + tags->nr_alloc > nr_cache))) { + spin_unlock_irqrestore_cond(&tags->lock, cpu, flags); + return -ENOSPC; + } + + tags->nr_alloc++; + tags->nr_free--; + + tag = tags->freelist[tags->nr_free]; + + if (!tags->nr_free) + cpumask_clear_cpu(cpu, &pt->free_tags); + cpumask_set_cpu(cpu, &pt->alloc_tags); + + spin_unlock_irqrestore_cond(&tags->lock, cpu, flags); + + cpu_to_tag(pt, cpu, tag); + + return tag; +} + +static int pull_tag(struct percpu_tags *pt, + struct percpu_cache *dtags, struct percpu_cache *stags, + bool force) +{ + int nr_cache = cache_size(pt); + int nr_batch = batch_size(nr_cache); + int nr_pull; + unsigned long uninitialized_var(flags); + int tag; + + double_spin_lock_irqsave_cond(&stags->lock, stags->cpu, + &dtags->lock, dtags->cpu, flags); + + if (force) { + nr_pull = min(nr_batch, stags->nr_free); + } else { + nr_pull = stags->nr_free + stags->nr_alloc - nr_cache; + nr_pull = min(nr_pull, stags->nr_free); + nr_pull = min(nr_pull, nr_batch); + } + + if (nr_pull < 1) { + spin_unlock_irqrestore_cond(&dtags->lock, dtags->cpu, flags); + spin_unlock_irqrestore_cond(&stags->lock, stags->cpu, flags); + return -ENOSPC; + } + + move_tags(dtags->freelist, &dtags->nr_free, + stags->freelist, &stags->nr_free, nr_pull); + + dtags->nr_alloc++; + dtags->nr_free--; + + tag = dtags->freelist[dtags->nr_free]; + + if (dtags->nr_free) + cpumask_set_cpu(dtags->cpu, &pt->free_tags); + + spin_unlock_irqrestore_cond(&dtags->lock, dtags->cpu, flags); + + if (!stags->nr_free) + cpumask_clear_cpu(stags->cpu, &pt->free_tags); + + spin_unlock_irqrestore_cond(&stags->lock, stags->cpu, flags); + + cpu_to_tag(pt, dtags->cpu, tag); + + return tag; +} + +wait_queue_head_t * +prepare_to_wait_tag(struct percpu_tags *pt, wait_queue_t *wait, int state) +{ + struct percpu_cache *tags; + unsigned long flags; + + local_irq_save(flags); + tags = this_cpu_ptr(pt->cache); + spin_lock(&tags->lock); + + prepare_to_wait(&tags->wait, wait, state); + cpumask_set_cpu(tags->cpu, &pt->wait_tags); + + local_irq_restore(flags); + spin_unlock(&tags->lock); + + return &tags->wait; +} + +static int +__alloc_tag_global(struct percpu_tags *pt, + int start_cpu, const cpumask_t *cpumask, bool force) +{ + struct percpu_cache *this_tags = per_cpu_ptr(pt->cache, start_cpu); + struct percpu_cache *remote_tags; + int cpu = start_cpu; + int n; + int tag = -ENOSPC; + + for (n = cpumask_weight(cpumask); n; n--) { + cpu = cpumask_next(cpu, cpumask); + if (cpu >= nr_cpu_ids) + cpu = cpumask_first(cpumask); + if (cpu >= nr_cpu_ids) + break; + + if (cpu == start_cpu) + continue; + + remote_tags = per_cpu_ptr(pt->cache, cpu); + tag = pull_tag(pt, this_tags, remote_tags, force); + if (tag >= 0) + break; + } + + return tag; +} + +static int alloc_tag_global(struct percpu_tags *pt) +{ + int this_cpu = smp_processor_id(); + int tag; + + tag = __alloc_tag_global(pt, this_cpu, &pt->free_tags, false); + if (tag < 0) + tag = __alloc_tag_global(pt, this_cpu, &pt->free_tags, true); + + return tag; +} + +int percpu_tags_alloc(struct percpu_tags *pt, int state) +{ + DEFINE_WAIT(wait); + wait_queue_head_t *wq = NULL; + int tag = -ENOSPC; + + for (;;) { + tag = alloc_tag_local(pt); + if (tag >= 0) + break; + + tag = alloc_tag_global(pt); + if (tag >= 0) + break; + + if (state == TASK_RUNNING) + break; + + wq = prepare_to_wait_tag(pt, &wait, state); + + schedule(); + + if (signal_pending_state(state, current)) { + tag = -ERESTARTSYS; + break; + } + } + + if (wq) + finish_wait(wq, &wait); + + return tag; +} +EXPORT_SYMBOL_GPL(percpu_tags_alloc); + +static bool __free_tag(struct percpu_tags *pt, + struct percpu_cache *tags, int tag, bool force) +{ + int nr_cache = cache_size(pt); + int cpu = tags->cpu; + unsigned long uninitialized_var(flags); + bool ret; + + spin_lock_irqsave_cond(&tags->lock, cpu, flags); + + BUG_ON(tags->nr_free >= pt->nr_tags); + if (force || (tags->nr_free + tags->nr_alloc < nr_cache)) { + tags->freelist[tags->nr_free] = tag; + tags->nr_free++; + cpumask_set_cpu(cpu, &pt->free_tags); + ret = true; + } else { + ret = false; + } + + spin_unlock_irqrestore_cond(&tags->lock, cpu, flags); + + return ret; +} + +static int free_tag(struct percpu_tags *pt, struct percpu_cache *tags, int tag) +{ + return __free_tag(pt, tags, tag, false); +} + +static int push_tag(struct percpu_tags *pt, struct percpu_cache *tags, int tag) +{ + return __free_tag(pt, tags, tag, true); +} + +static void dealloc_tag(struct percpu_tags *pt, int cpu, int tag) +{ + struct percpu_cache *tags = per_cpu_ptr(pt->cache, cpu); + unsigned long uninitialized_var(flags); + + spin_lock_irqsave_cond(&tags->lock, cpu, flags); + + BUG_ON(tags->nr_alloc < 1); + tags->nr_alloc--; + if (!tags->nr_alloc) + cpumask_clear_cpu(tags->cpu, &pt->alloc_tags); + + spin_unlock_irqrestore_cond(&tags->lock, cpu, flags); +} + +static int free_tag_scarce(struct percpu_tags *pt, int start_cpu, int tag) +{ + struct percpu_cache *tags; + int cpu; + + cpu = cpumask_next(start_cpu, &pt->wait_tags); + if (cpu >= nr_cpu_ids) + cpu = cpumask_first(&pt->wait_tags); + + if (cpu >= nr_cpu_ids) + cpu = cpumask_next(start_cpu, &pt->alloc_tags); + if (cpu >= nr_cpu_ids) + cpu = cpumask_first(&pt->alloc_tags); + + if (cpu >= nr_cpu_ids) + cpu = start_cpu; + + tags = per_cpu_ptr(pt->cache, cpu); + push_tag(pt, tags, tag); + + return cpu; +} + +static int __free_tag_normal(struct percpu_tags *pt, + int start_cpu, int tag, const cpumask_t *cpumask) +{ + struct percpu_cache *tags; + int cpu; + int n; + + tags = per_cpu_ptr(pt->cache, start_cpu); + if (free_tag(pt, tags, tag)) + return start_cpu; + + cpu = nr_cpu_ids; + + for (n = cpumask_weight(cpumask); n; n--) { + cpu = cpumask_next(cpu, cpumask); + if (cpu >= nr_cpu_ids) + cpu = cpumask_first(cpumask); + if (cpu >= nr_cpu_ids) + break; + + if (cpu == start_cpu) + continue; + + tags = per_cpu_ptr(pt->cache, cpu); + if (free_tag(pt, tags, tag)) + break; + } + + return cpu; +} + +static int free_tag_normal(struct percpu_tags *pt, int start_cpu, int tag) +{ + struct percpu_cache *tags; + int cpu; + + cpu = __free_tag_normal(pt, start_cpu, tag, &pt->alloc_tags); + + if (cpu >= nr_cpu_ids) + cpu = __free_tag_normal(pt, start_cpu, tag, &pt->wait_tags); + + if (cpu >= nr_cpu_ids) { + cpu = start_cpu; + tags = per_cpu_ptr(pt->cache, cpu); + push_tag(pt, tags, tag); + } + + return cpu; +} + +static bool wake_on_cpu(struct percpu_tags *pt, int cpu) +{ + struct percpu_cache *tags = per_cpu_ptr(pt->cache, cpu); + int wq_active; + unsigned long uninitialized_var(flags); + + spin_lock_irqsave_cond(&tags->lock, cpu, flags); + wq_active = waitqueue_active(&tags->wait); + if (!wq_active) + cpumask_clear_cpu(cpu, &pt->wait_tags); + spin_unlock_irqrestore_cond(&tags->lock, cpu, flags); + + if (wq_active) + wake_up(&tags->wait); + + return wq_active; +} + +void percpu_tags_free(struct percpu_tags *pt, int tag) +{ + bool scarce = pt->nr_tags < cpumask_weight(cpu_online_mask); + int cpu, alloc_cpu; + int n; + + alloc_cpu = cpu_from_tag(pt, tag); + dealloc_tag(pt, alloc_cpu, tag); + + if (scarce) + cpu = free_tag_scarce(pt, alloc_cpu, tag); + else + cpu = free_tag_normal(pt, alloc_cpu, tag); + + if (wake_on_cpu(pt, cpu)) + return; + + for (n = cpumask_weight(&pt->wait_tags); n; n--) { + cpu = cpumask_next(cpu, &pt->wait_tags); + if (cpu >= nr_cpu_ids) + cpu = cpumask_first(&pt->wait_tags); + if (cpu >= nr_cpu_ids) + break; + + if (wake_on_cpu(pt, cpu)) + break; + } +} +EXPORT_SYMBOL_GPL(percpu_tags_free); + +int percpu_tags_init(struct percpu_tags *pt, int nr_tags) +{ + struct percpu_cache *tags; + int order; + int cpu; + int i; + + order = get_order(nr_tags * sizeof(pt->tag_cpu_map[0])); + pt->tag_cpu_map = (int*)__get_free_pages(GFP_KERNEL, order); + if (!pt->tag_cpu_map) + return -ENOMEM; + + pt->cache = __alloc_percpu(sizeof(*pt->cache) + + nr_tags * sizeof(pt->cache->freelist[0]), + sizeof(pt->cache->freelist[0])); + if (!pt->cache) { + free_pages((unsigned long)pt->tag_cpu_map, order); + return -ENOMEM; + } + + pt->nr_tags = nr_tags; + + for_each_possible_cpu(cpu) { + tags = per_cpu_ptr(pt->cache, cpu); + tags->cpu = cpu; + spin_lock_init(&tags->lock); + init_waitqueue_head(&tags->wait); + } + + tags = this_cpu_ptr(pt->cache); + for (i = 0; i < nr_tags; i++) + tags->freelist[i] = i; + tags->nr_free = nr_tags; + cpumask_set_cpu(tags->cpu, &pt->free_tags); + + return 0; +} +EXPORT_SYMBOL_GPL(percpu_tags_init); + +void percpu_tags_destroy(struct percpu_tags *pt) +{ + int order = get_order(pt->nr_tags * sizeof(pt->tag_cpu_map[0])); + free_pages((unsigned long)pt->tag_cpu_map, order); + free_percpu(pt->cache); +} +EXPORT_SYMBOL_GPL(percpu_tags_destroy); + +int percpu_tags_for_each_free(struct percpu_tags *pt, + int (*fn)(unsigned, void *), void *data) +{ + unsigned long flags; + struct percpu_cache *remote; + unsigned cpu, i, err = 0; + + local_irq_save(flags); + for_each_possible_cpu(cpu) { + remote = per_cpu_ptr(pt->cache, cpu); + spin_lock(&remote->lock); + for (i = 0; i < remote->nr_free; i++) { + err = fn(remote->freelist[i], data); + if (err) + break; + } + spin_unlock(&remote->lock); + if (err) + goto out; + } + +out: + local_irq_restore(flags); + return err; +} +EXPORT_SYMBOL_GPL(percpu_tags_for_each_free); + +int percpu_tags_free_tags(struct percpu_tags *pt, int cpu) +{ + struct percpu_cache *remote; + if (cpu >= nr_cpu_ids) + return 0; + remote = per_cpu_ptr(pt->cache, cpu); + return remote->nr_free; +} +EXPORT_SYMBOL_GPL(percpu_tags_free_tags); -- 1.7.7.6 -- To unsubscribe from this list: send the line "unsubscribe linux-scsi" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html