On 10/12/21 11:17 AM, Jens Axboe wrote:
+void sbitmap_queue_clear_batch(struct sbitmap_queue *sbq, int offset, + int *tags, int nr_tags) +{ + struct sbitmap *sb = &sbq->sb; + unsigned long *addr = NULL; + unsigned long mask = 0; + int i; + + smp_mb__before_atomic(); + for (i = 0; i < nr_tags; i++) { + const int tag = tags[i] - offset; + unsigned long *this_addr; + + /* since we're clearing a batch, skip the deferred map */ + this_addr = &sb->map[SB_NR_TO_INDEX(sb, tag)].word; + if (!addr) { + addr = this_addr; + } else if (addr != this_addr) { + atomic_long_andnot(mask, (atomic_long_t *) addr); + mask = 0; + addr = this_addr; + } + mask |= (1UL << SB_NR_TO_BIT(sb, tag)); + } + + if (mask) + atomic_long_andnot(mask, (atomic_long_t *) addr); + + smp_mb__after_atomic(); + sbitmap_queue_wake_up(sbq); + sbitmap_update_cpu_hint(&sbq->sb, raw_smp_processor_id(), + tags[nr_tags - 1] - offset); +} + void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr, unsigned int cpu) {
How does replacing the sbitmap_queue_clear() implementation by calling sbitmap_queue_clear_batch() affect performance? I'm wondering whether it is possible to prevent code duplication without affecting performance negatively.
Thanks, Bart.