On Thu, Dec 23, 2010 at 4:36 AM, Peter Zijlstra <peterz@xxxxxxxxxxxxx> wrote:
> On Wed, 2010-12-22 at 18:08 +0100, Peter Zijlstra wrote:
>> On Wed, 2010-12-22 at 16:14 +0100, Miklos Vajna wrote:
>> > kernel-build is a git build using the config I already sent, and after a
>> > 'git checkout v2.6.36'. I can try to build master as well; so far what I
>> > saw is that the bug occurs there too, but less frequently, so maybe
>> > that's a bit harder to debug.
>>
>> Right, so I've got v2.6.36 exploding, while current -git and -tip won't
>> explode for me; I tried both about 20 times. So I'll try and figure out
>> wth makes .36 explode and then see if later kernels still suffer from
>> that problem.
>
> OK, so something goes wrong with load accounting.
>
> Full log at: http://programming.kicks-ass.net/sekrit/log4
>
> I print the entire runqueue before and after each op; the last good one
> is a post-print:
>
> kworker/-9       0d..2. 2013644us : pick_next_task_fair: picked: f6a8de5c, kworker/u:1/1251
> kworker/-9       0d..2. 2013644us : print_runqueue <-pick_next_task_fair
> kworker/-9       0d..2. 2013645us : __print_runqueue:  cfs_rq: c2407c34, nr: 3, load: 3072
> kworker/-9       0d..2. 2013645us : __print_runqueue:  curr: f6a8de5c, comm: kworker/u:1/1251, load: 1024
> kworker/-9       0d..2. 2013646us : __print_runqueue:  se: f69e6300, load: 1024,
> kworker/-9       0d..2. 2013646us : __print_runqueue:    cfs_rq: f69e6540, nr: 2, load: 2048
> kworker/-9       0d..2. 2013646us : __print_runqueue:    curr: (null)
> kworker/-9       0d..2. 2013647us : __print_runqueue:    se: f69e65a0, load: 1024,
> kworker/-9       0d..2. 2013647us : __print_runqueue:      cfs_rq: f69e6240, nr: 1, load: 1024
> kworker/-9       0d..2. 2013648us : __print_runqueue:      curr: (null)
> kworker/-9       0d..2. 2013648us : __print_runqueue:      se: f725916c, comm: loadkeys/1225, load: 1024
> kworker/-9       0d..2. 2013649us : __print_runqueue:    se: f69e6660, load: 1024,
> kworker/-9       0d..2. 2013649us : __print_runqueue:      cfs_rq: f69e6600, nr: 1, load: 1024
> kworker/-9       0d..2. 2013650us : __print_runqueue:      curr: (null)
> kworker/-9       0d..2. 2013650us : __print_runqueue:      se: f6a8da0c, comm: systemd-tmpfile/1246, load: 1024
> kworker/-9       0d..2. 2013651us : __print_runqueue:  se: f6a8cd1c, comm: kworker/u:1/1252, load: 1024
>
> And then the next pre-print looks like:
>
> systemd--1251    0d..5. 2015398us : enqueue_task_fair <-enqueue_task
> systemd--1251    0d..5. 2015398us : print_runqueue <-enqueue_task_fair
> systemd--1251    0d..5. 2015399us : __print_runqueue:  cfs_rq: c2407c34, nr: 3, load: 3072
> systemd--1251    0d..5. 2015400us : __print_runqueue:  curr: f6a8de5c, comm: systemd-cgroups/1251, load: 1024
> systemd--1251    0d..5. 2015401us : __print_runqueue:  se: f69e6300, load: 1024,
> systemd--1251    0d..5. 2015401us : __print_runqueue:    cfs_rq: f69e6540, nr: 2, load: 2048
> systemd--1251    0d..5. 2015402us : __print_runqueue:    curr: (null)
> systemd--1251    0d..5. 2015402us : __print_runqueue:    se: f69e65a0, load: 4137574976,

the load here is a kernel address: 4137574976 == 0xf69e6240, which is
this se's child cfs_rq printed on the next line, odd
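(In v2.6.36, fwiw, the load weight is the very first member of both
struct sched_entity and struct cfs_rq, so any stray pointer-sized write
to the start of either object lands exactly in load.weight. Abridged
sketch of the two layouts, quoted from memory, so double-check against
the actual tree:

struct load_weight {
        unsigned long weight;
        unsigned long inv_weight;
};

struct sched_entity {
        struct load_weight      load;   /* offset 0, for load-balancing */
        struct rb_node          run_node;
        struct list_head        group_node;
        /* ... */
};

struct cfs_rq {
        struct load_weight      load;   /* offset 0 here as well */
        unsigned long           nr_running;
        /* ... */
};

That would fit the corrupted weights always being addresses near the
affected se/cfs_rq pair, as noted further down.)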
Thanks,
Yong

> systemd--1251    0d..5. 2015402us : __print_runqueue:      cfs_rq: f69e6240, nr: 1, load: 4137575360
> systemd--1251    0d..5. 2015403us : __print_runqueue:      curr: (null)
> systemd--1251    0d..5. 2015404us : __print_runqueue:      se: f725916c, comm: loadkeys/1225, load: 1024
> systemd--1251    0d..5. 2015404us : __print_runqueue:    se: f69e6660, load: 1024,
> systemd--1251    0d..5. 2015404us : __print_runqueue:      cfs_rq: f69e6600, nr: 1, load: 1024
> systemd--1251    0d..5. 2015405us : __print_runqueue:      curr: (null)
> systemd--1251    0d..5. 2015406us : __print_runqueue:      se: f6a8da0c, comm: systemd-tmpfile/1246, load: 1024
> systemd--1251    0d..5. 2015406us : __print_runqueue:  se: f6a8cd1c, comm: kworker/u:1/1252, load: 1024
>
> The load value being crazy like this leads to dequeue_task_fair()
> making weird decisions, since it checks for load == 0 to decide when a
> group entity can be dequeued.
>
> Not doing that then leads to 'dead' groups still being enqueued, and
> eventually to referencing freed memory etc.
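(For reference, the load == 0 check in question is this one in the
unpatched v2.6.36 dequeue path; quoted from memory, so verify against
the actual tree:

static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
        struct cfs_rq *cfs_rq;
        struct sched_entity *se = &p->se;

        for_each_sched_entity(se) {
                cfs_rq = cfs_rq_of(se);
                dequeue_entity(cfs_rq, se, flags);
                /* Don't dequeue parent if it has other entities besides us */
                if (cfs_rq->load.weight)
                        break;
                flags |= DEQUEUE_SLEEP;
        }

        hrtick_update(rq);
}

With a corrupted cfs_rq->load.weight that break fires, or fails to
fire, at the wrong time, which is how a 'dead' group entity can stay
queued and later reference freed memory.)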
> I've no clue why any of this happens; it could be random scribbles,
> except that the se/cfs_rq combo are always off together (although not
> always by the same amount).
>
> Anyway, my brain needs sleep; I'll continue poking tomorrow.
>
> ---
>  kernel/sched_fair.c |   71 +++++++++++++++++++++++++++++++++++++++++++++++++++
>  1 files changed, 71 insertions(+), 0 deletions(-)
>
> diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
> index db3f674..8b8b038 100644
> --- a/kernel/sched_fair.c
> +++ b/kernel/sched_fair.c
> @@ -333,6 +333,8 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
>         s64 key = entity_key(cfs_rq, se);
>         int leftmost = 1;
>
> +       trace_printk("%p to %p, %ld tasks\n", se, cfs_rq, cfs_rq->nr_running);
> +
>         /*
>          * Find the right place in the rbtree:
>          */
> @@ -364,6 +366,8 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
>
>  static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
>  {
> +       trace_printk("%p from %p, %ld left\n", se, cfs_rq, cfs_rq->nr_running);
> +
>         if (cfs_rq->rb_leftmost == &se->run_node) {
>                 struct rb_node *next_node;
>
> @@ -394,6 +398,53 @@ static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
>         return rb_entry(last, struct sched_entity, run_node);
>  }
>
> +#define INDENT(i) ((i)*2), "                                  "
> +
> +static void notrace __print_runqueue(struct cfs_rq *cfs_rq, int indent);
> +
> +static void __always_inline
> +__print_entity(struct sched_entity *se, char *name, int indent)
> +{
> +       if (!se) {
> +               trace_printk("%.*s %s: %p\n", INDENT(indent), name, se);
> +               return;
> +       }
> +
> +       if (entity_is_task(se)) {
> +               struct task_struct *p = task_of(se);
> +               trace_printk("%.*s %s: %p, comm: %s/%d, load: %ld\n",
> +                               INDENT(indent), name,
> +                               se, p->comm, p->pid, se->load.weight);
> +       } else {
> +               trace_printk("%.*s %s: %p, load: %ld, ",
> +                               INDENT(indent), name,
> +                               se, se->load.weight);
> +               __print_runqueue(group_cfs_rq(se), indent+1);
> +       }
> +}
> +
> +static void notrace __print_runqueue(struct cfs_rq *cfs_rq, int indent)
> +{
> +       struct rb_node *n;
> +
> +       trace_printk("%.*s cfs_rq: %p, nr: %ld, load: %ld\n",
> +                       INDENT(indent),
> +                       cfs_rq, cfs_rq->nr_running,
> +                       cfs_rq->load.weight);
> +
> +       __print_entity(cfs_rq->curr, "curr", indent);
> +
> +       for (n = cfs_rq->rb_leftmost; n; n = rb_next(n)) {
> +               struct sched_entity *se = rb_entry(n, struct sched_entity, run_node);
> +               __print_entity(se, "se", indent);
> +       }
> +}
> +
> +void print_runqueue(struct cfs_rq *cfs_rq)
> +{
> +       __print_runqueue(cfs_rq, 0);
> +}
> +
>  /**************************************************************
>   * Scheduling class statistics methods:
>   */
> @@ -1047,6 +1098,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>         struct cfs_rq *cfs_rq;
>         struct sched_entity *se = &p->se;
>
> +       print_runqueue(&rq->cfs);
>         for_each_sched_entity(se) {
>                 if (se->on_rq)
>                         break;
> @@ -1055,6 +1107,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>                 flags = ENQUEUE_WAKEUP;
>         }
>
> +       trace_printk("add task: %p, %s/%d\n", &p->se, p->comm, p->pid);
> +       print_runqueue(&rq->cfs);
> +
>         hrtick_update(rq);
>  }
>
> @@ -1068,6 +1123,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>         struct cfs_rq *cfs_rq;
>         struct sched_entity *se = &p->se;
>
> +       print_runqueue(&rq->cfs);
>         for_each_sched_entity(se) {
>                 cfs_rq = cfs_rq_of(se);
>                 dequeue_entity(cfs_rq, se, flags);
> @@ -1077,6 +1133,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>                 flags |= DEQUEUE_SLEEP;
>         }
>
> +       trace_printk("del task: %p, %s/%d\n", &p->se, p->comm, p->pid);
> +       print_runqueue(&rq->cfs);
> +
>         hrtick_update(rq);
>  }
>
> @@ -1719,6 +1778,8 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
>         struct cfs_rq *cfs_rq = &rq->cfs;
>         struct sched_entity *se;
>
> +       print_runqueue(&rq->cfs);
> +
>         if (!cfs_rq->nr_running)
>                 return NULL;
>
> @@ -1731,6 +1792,9 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
>         p = task_of(se);
>         hrtick_start_fair(rq, p);
>
> +       trace_printk("picked: %p, %s/%d\n", se, p->comm, p->pid);
> +       print_runqueue(&rq->cfs);
> +
>         return p;
>  }
>
> @@ -1742,10 +1806,15 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
>         struct sched_entity *se = &prev->se;
>         struct cfs_rq *cfs_rq;
>
> +       print_runqueue(&rq->cfs);
>         for_each_sched_entity(se) {
>                 cfs_rq = cfs_rq_of(se);
>                 put_prev_entity(cfs_rq, se);
>         }
> +
> +       trace_printk("put: %p, %s/%d\n", se, prev->comm, prev->pid);
> +       print_runqueue(&rq->cfs);
> +
>  }
>
>  #ifdef CONFIG_SMP
> @@ -3819,8 +3888,10 @@ static void set_curr_task_fair(struct rq *rq)
>  {
>         struct sched_entity *se = &rq->curr->se;
>
> +       print_runqueue(&rq->cfs);
>         for_each_sched_entity(se)
>                 set_next_entity(cfs_rq_of(se), se);
> +       print_runqueue(&rq->cfs);
>  }
>
>  #ifdef CONFIG_FAIR_GROUP_SCHED

-- 
Only stand for myself.