On Sun, Oct 23, 2016 at 06:57:26PM -0700, Davidlohr Bueso wrote: > On Wed, 19 Oct 2016, Peter Zijlstra wrote: > > > Subject: sched: Better explain sleep/wakeup > > From: Peter Zijlstra <peterz@xxxxxxxxxxxxx> > > Date: Wed Oct 19 15:45:27 CEST 2016 > > > > There were a few questions wrt how sleep-wakeup works. Try and explain > > it more. > > > > Requested-by: Will Deacon <will.deacon@xxxxxxx> > > Signed-off-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx> > > --- > > include/linux/sched.h | 52 ++++++++++++++++++++++++++++++++++---------------- > > kernel/sched/core.c | 15 +++++++------- > > 2 files changed, 44 insertions(+), 23 deletions(-) > > > > --- a/include/linux/sched.h > > +++ b/include/linux/sched.h > > @@ -262,20 +262,9 @@ extern char ___assert_task_state[1 - 2*! > > #define set_task_state(tsk, state_value) \ > > do { \ > > (tsk)->task_state_change = _THIS_IP_; \ > > - smp_store_mb((tsk)->state, (state_value)); \ > > + smp_store_mb((tsk)->state, (state_value)); \ > > } while (0) > > > > -/* > > - * set_current_state() includes a barrier so that the write of current->state > > - * is correctly serialised wrt the caller's subsequent test of whether to > > - * actually sleep: > > - * > > - * set_current_state(TASK_UNINTERRUPTIBLE); > > - * if (do_i_need_to_sleep()) > > - * schedule(); > > - * > > - * If the caller does not need such serialisation then use __set_current_state() > > - */ > > #define __set_current_state(state_value) \ > > do { \ > > current->task_state_change = _THIS_IP_; \ > > @@ -284,11 +273,19 @@ extern char ___assert_task_state[1 - 2*! > > #define set_current_state(state_value) \ > > do { \ > > current->task_state_change = _THIS_IP_; \ > > - smp_store_mb(current->state, (state_value)); \ > > + smp_store_mb(current->state, (state_value)); \ > > } while (0) > > > > #else > > > > +/* > > + * @tsk had better be current, or you get to keep the pieces. > > That reminds me we were getting rid of the set_task_state() calls. Bcache was > pending, being only user in the kernel that doesn't actually use current; but > instead breaks newly (yet blocked/uninterruptible) created garbage collection > kthread. I cannot figure out why this is done (ie purposely accounting the > load avg. Furthermore gc kicks in in very specific scenarios obviously, such > as as by the allocator task, so I don't see why bcache gc should want to be > interruptible. > > Kent, Jens, can we get rid of this? Here's a patch that just fixes the way gc gets woken up. Eric, you want to take this? -- >8 -- Subject: [PATCH] bcache: Make gc wakeup saner This lets us ditch a set_task_state() call. Signed-off-by: Kent Overstreet <kent.overstreet@xxxxxxxxx> --- drivers/md/bcache/bcache.h | 4 ++-- drivers/md/bcache/btree.c | 39 ++++++++++++++++++++------------------- drivers/md/bcache/btree.h | 3 +-- drivers/md/bcache/request.c | 4 +--- drivers/md/bcache/super.c | 2 ++ 5 files changed, 26 insertions(+), 26 deletions(-) diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 6b420a55c7..c3ea03c9a1 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -425,7 +425,7 @@ struct cache { * until a gc finishes - otherwise we could pointlessly burn a ton of * cpu */ - unsigned invalidate_needs_gc:1; + unsigned invalidate_needs_gc; bool discard; /* Get rid of? */ @@ -593,8 +593,8 @@ struct cache_set { /* Counts how many sectors bio_insert has added to the cache */ atomic_t sectors_to_gc; + wait_queue_head_t gc_wait; - wait_queue_head_t moving_gc_wait; struct keybuf moving_gc_keys; /* Number of moving GC bios in flight */ struct semaphore moving_in_flight; diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 81d3db40cd..2efdce0724 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -1757,32 +1757,34 @@ static void bch_btree_gc(struct cache_set *c) bch_moving_gc(c); } -static int bch_gc_thread(void *arg) +static bool gc_should_run(struct cache_set *c) { - struct cache_set *c = arg; struct cache *ca; unsigned i; - while (1) { -again: - bch_btree_gc(c); + for_each_cache(ca, c, i) + if (ca->invalidate_needs_gc) + return true; - set_current_state(TASK_INTERRUPTIBLE); - if (kthread_should_stop()) - break; + if (atomic_read(&c->sectors_to_gc) < 0) + return true; - mutex_lock(&c->bucket_lock); + return false; +} - for_each_cache(ca, c, i) - if (ca->invalidate_needs_gc) { - mutex_unlock(&c->bucket_lock); - set_current_state(TASK_RUNNING); - goto again; - } +static int bch_gc_thread(void *arg) +{ + struct cache_set *c = arg; - mutex_unlock(&c->bucket_lock); + while (1) { + wait_event_interruptible(c->gc_wait, + kthread_should_stop() || gc_should_run(c)); - schedule(); + if (kthread_should_stop()) + break; + + set_gc_sectors(c); + bch_btree_gc(c); } return 0; @@ -1790,11 +1792,10 @@ static int bch_gc_thread(void *arg) int bch_gc_thread_start(struct cache_set *c) { - c->gc_thread = kthread_create(bch_gc_thread, c, "bcache_gc"); + c->gc_thread = kthread_run(bch_gc_thread, c, "bcache_gc"); if (IS_ERR(c->gc_thread)) return PTR_ERR(c->gc_thread); - set_task_state(c->gc_thread, TASK_INTERRUPTIBLE); return 0; } diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index 5c391fa01b..9b80417cd5 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -260,8 +260,7 @@ void bch_initial_mark_key(struct cache_set *, int, struct bkey *); static inline void wake_up_gc(struct cache_set *c) { - if (c->gc_thread) - wake_up_process(c->gc_thread); + wake_up(&c->gc_wait); } #define MAP_DONE 0 diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 40ffe5e424..a37c1776f2 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -196,10 +196,8 @@ static void bch_data_insert_start(struct closure *cl) struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); struct bio *bio = op->bio, *n; - if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0) { - set_gc_sectors(op->c); + if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0) wake_up_gc(op->c); - } if (op->bypass) return bch_data_invalidate(cl); diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 849ad441cd..66669c8f41 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1491,6 +1491,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) mutex_init(&c->bucket_lock); init_waitqueue_head(&c->btree_cache_wait); init_waitqueue_head(&c->bucket_wait); + init_waitqueue_head(&c->gc_wait); sema_init(&c->uuid_write_mutex, 1); spin_lock_init(&c->btree_gc_time.lock); @@ -1550,6 +1551,7 @@ static void run_cache_set(struct cache_set *c) for_each_cache(ca, c, i) c->nbuckets += ca->sb.nbuckets; + set_gc_sectors(c); if (CACHE_SYNC(&c->sb)) { LIST_HEAD(journal); -- 2.9.3 -- To unsubscribe from this list: send the line "unsubscribe linux-bcache" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html