LGTM on first read -- I'll read it again and test in a test branch.

Reviewed-by: Michael Lyle <mlyle@xxxxxxxx>

On 01/26/2018 12:24 AM, tang.junhui@xxxxxxxxxx wrote:
> From: Tang Junhui <tang.junhui@xxxxxxxxxx>
>
> After running small-write I/O for a long time, we found that CPU
> occupancy was very high and I/O performance had dropped by about half:
>
> [root@ceph151 internal]# top
> top - 15:51:05 up 1 day, 2:43, 4 users, load average: 16.89, 15.15, 16.53
> Tasks: 2063 total, 4 running, 2059 sleeping, 0 stopped, 0 zombie
> %Cpu(s):  4.3 us, 17.1 sy,  0.0 ni, 66.1 id, 12.0 wa,  0.0 hi,  0.5 si,  0.0 st
> KiB Mem : 65450044 total, 24586420 free, 38909008 used,  1954616 buff/cache
> KiB Swap: 65667068 total, 65667068 free,        0 used. 25136812 avail Mem
>
>   PID USER PR NI    VIRT    RES   SHR S %CPU %MEM     TIME+ COMMAND
>  2023 root 20  0       0      0     0 S 55.1  0.0   0:04.42 kworker/11:191
> 14126 root 20  0       0      0     0 S 42.9  0.0   0:08.72 kworker/10:3
>  9292 root 20  0       0      0     0 S 30.4  0.0   1:10.99 kworker/6:1
>  8553 ceph 20  0 4242492 1.805g 18804 S 30.0  2.9 410:07.04 ceph-osd
> 12287 root 20  0       0      0     0 S 26.7  0.0   0:28.13 kworker/7:85
> 31019 root 20  0       0      0     0 S 26.1  0.0   1:30.79 kworker/22:1
>  1787 root 20  0       0      0     0 R 25.7  0.0   5:18.45 kworker/8:7
> 32169 root 20  0       0      0     0 S 14.5  0.0   1:01.92 kworker/23:1
> 21476 root 20  0       0      0     0 S 13.9  0.0   0:05.09 kworker/1:54
>  2204 root 20  0       0      0     0 S 12.5  0.0   1:25.17 kworker/9:10
> 16994 root 20  0       0      0     0 S 12.2  0.0   0:06.27 kworker/5:106
> 15714 root 20  0       0      0     0 R 10.9  0.0   0:01.85 kworker/19:2
>  9661 ceph 20  0 4246876 1.731g 18800 S 10.6  2.8 403:00.80 ceph-osd
> 11460 ceph 20  0 4164692 2.206g 18876 S 10.6  3.5 360:27.19 ceph-osd
>  9960 root 20  0       0      0     0 S 10.2  0.0   0:02.75 kworker/2:139
> 11699 ceph 20  0 4169244 1.920g 18920 S 10.2  3.1 355:23.67 ceph-osd
>  6843 ceph 20  0 4197632 1.810g 18900 S  9.6  2.9 380:08.30 ceph-osd
>
> The kernel workers consumed a lot of CPU, and I found they were running
> journal work: the journal was reclaiming resources and flushing btree
> nodes with surprising frequency.
>
> Through further analysis, we found that in btree_flush_write() we try
> to get the btree node with the smallest fifo index to flush by
> traversing all the btree nodes in c->bucket_hash. After we get it,
> since no lock protects it, this btree node may already have been
> written to the cache device by other work; if that happens, we retry
> the traversal of c->bucket_hash to get another btree node. When the
> problem occurred, the number of retries was very high, and we spent a
> lot of CPU looking for an appropriate btree node.
>
> In this patch, we record up to 128 btree nodes with the smallest fifo
> indexes in a heap, and pop them one by one when we need to flush a
> btree node. This greatly reduces the time the loop spends finding an
> appropriate btree node, and also reduces CPU occupancy.
>
> Signed-off-by: Tang Junhui <tang.junhui@xxxxxxxxxx>
> ---
>  drivers/md/bcache/bcache.h  |  2 ++
>  drivers/md/bcache/journal.c | 47 ++++++++++++++++++++++++++++++---------------
>  2 files changed, 34 insertions(+), 15 deletions(-)
>
> diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
> index 0432e28..b343ba4 100644
> --- a/drivers/md/bcache/bcache.h
> +++ b/drivers/md/bcache/bcache.h
> @@ -669,6 +669,8 @@ struct cache_set {
>
>  #define BUCKET_HASH_BITS	12
>  	struct hlist_head	bucket_hash[1 << BUCKET_HASH_BITS];
> +
> +	DECLARE_HEAP(struct btree *, flush_btree);
>  };
>
>  struct bbio {
> diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
> index 47fd0b8..f42d3ea 100644
> --- a/drivers/md/bcache/journal.c
> +++ b/drivers/md/bcache/journal.c
> @@ -363,6 +363,12 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list)
>  }
>
>  /* Journalling */
> +#define journal_max_cmp(l, r) \
> +	(fifo_idx(&c->journal.pin, btree_current_write(l)->journal) < \
> +	 fifo_idx(&(c)->journal.pin, btree_current_write(r)->journal))
> +#define journal_min_cmp(l, r) \
> +	(fifo_idx(&c->journal.pin, btree_current_write(l)->journal) > \
> +	 fifo_idx(&(c)->journal.pin, btree_current_write(r)->journal))
>
>  static void btree_flush_write(struct cache_set *c)
>  {
> @@ -370,25 +376,35 @@ static void btree_flush_write(struct cache_set *c)
>  	 * Try to find the btree node with that references the oldest journal
>  	 * entry, best is our current candidate and is locked if non NULL:
>  	 */
> -	struct btree *b, *best;
> -	unsigned i;
> +	struct btree *b;
> +	int i;
>
>  	atomic_long_inc(&c->flush_write);
> +
>  retry:
> -	best = NULL;
> -
> -	for_each_cached_btree(b, c, i)
> -		if (btree_current_write(b)->journal) {
> -			if (!best)
> -				best = b;
> -			else if (journal_pin_cmp(c,
> -					btree_current_write(best)->journal,
> -					btree_current_write(b)->journal)) {
> -				best = b;
> +	spin_lock(&c->journal.lock);
> +	if (heap_empty(&c->flush_btree)) {
> +		for_each_cached_btree(b, c, i)
> +			if (btree_current_write(b)->journal) {
> +				if (!heap_full(&c->flush_btree))
> +					heap_add(&c->flush_btree, b,
> +						 journal_max_cmp);
> +				else if (journal_max_cmp(b,
> +					 heap_peek(&c->flush_btree))) {
> +					c->flush_btree.data[0] = b;
> +					heap_sift(&c->flush_btree, 0,
> +						  journal_max_cmp);
> +				}
>  			}
> -		}
>
> -	b = best;
> +		for (i = c->flush_btree.used / 2 - 1; i >= 0; --i)
> +			heap_sift(&c->flush_btree, i, journal_min_cmp);
> +	}
> +
> +	b = NULL;
> +	heap_pop(&c->flush_btree, b, journal_min_cmp);
> +	spin_unlock(&c->journal.lock);
> +
>  	if (b) {
>  		mutex_lock(&b->write_lock);
>  		if (!btree_current_write(b)->journal) {
> @@ -819,7 +835,8 @@ int bch_journal_alloc(struct cache_set *c)
>  	j->w[0].c = c;
>  	j->w[1].c = c;
>
> -	if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) ||
> +	if (!(init_heap(&c->flush_btree, 128, GFP_KERNEL)) ||
> +	    !(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) ||
>  	    !(j->w[0].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS)) ||
>  	    !(j->w[1].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS)))
>  		return -ENOMEM;
>
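One note for anyone following the selection logic: this is the classic
bounded "top-K" heap pattern. The scan keeps the 128 smallest fifo
indexes in a max-heap (journal_max_cmp), so the worst remaining
candidate sits at the root and is cheap to evict when a smaller key
shows up; the bottom-up heap_sift loop then re-heapifies the same array
in min order so heap_pop hands back nodes oldest-journal-entry-first.
Since the heap is only rebuilt when it runs empty, the O(n) traversal
of c->bucket_hash should run roughly once per up-to-128 flushes instead
of once (or, with the old retry storm, many times) per flush. Below is
a minimal standalone sketch of the pattern -- hypothetical userspace
code with plain ints standing in for btree nodes keyed by fifo index,
not the bcache implementation itself:

/* Bounded "top-K smallest" heap sketch: scan once keeping the K
 * smallest keys in a max-heap, then re-heapify in min order and pop
 * smallest-first. All names here are illustrative only. */
#include <stdio.h>

#define K 4			/* the patch uses 128 */

static int heap[K];
static int used;

static void swap_ints(int *a, int *b)
{
	int t = *a;
	*a = *b;
	*b = t;
}

/* Sift entry i down; cmp(parent, child) != 0 means heap order is violated. */
static void sift(int i, int (*cmp)(int, int))
{
	for (;;) {
		int child = 2 * i + 1;

		if (child >= used)
			break;
		/* Prefer the child that should be promoted under this order. */
		if (child + 1 < used && cmp(heap[child], heap[child + 1]))
			child++;
		if (!cmp(heap[i], heap[child]))
			break;
		swap_ints(&heap[i], &heap[child]);
		i = child;
	}
}

static int max_cmp(int l, int r) { return l < r; }	/* max-heap order */
static int min_cmp(int l, int r) { return l > r; }	/* min-heap order */

int main(void)
{
	int keys[] = { 9, 3, 7, 1, 8, 2, 6, 5 };
	int n = sizeof(keys) / sizeof(keys[0]);
	int i, j;

	/* Phase 1 (mirrors the heap_add/heap_sift scan): keep the K
	 * smallest keys, evicting the current maximum at the root
	 * whenever a smaller key shows up. */
	for (i = 0; i < n; i++) {
		if (used < K) {
			heap[used] = keys[i];
			for (j = used++; j && max_cmp(heap[(j - 1) / 2], heap[j]); j = (j - 1) / 2)
				swap_ints(&heap[(j - 1) / 2], &heap[j]);
		} else if (keys[i] < heap[0]) {
			heap[0] = keys[i];
			sift(0, max_cmp);
		}
	}

	/* Phase 2 (mirrors the journal_min_cmp loop): bottom-up
	 * re-heapify into min order, then pop smallest-first, the way
	 * btree_flush_write() pops the node pinning the oldest entry. */
	for (i = used / 2 - 1; i >= 0; i--)
		sift(i, min_cmp);

	while (used) {
		printf("%d ", heap[0]);		/* prints: 1 2 3 5 */
		heap[0] = heap[--used];
		sift(0, min_cmp);
	}
	putchar('\n');
	return 0;
}

Built with plain "cc", this prints "1 2 3 5": the four smallest keys in
ascending order, matching the patch's scan-then-re-heapify-then-pop
sequence under c->journal.lock.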