Thanks Peter. We'll give that patch a try as part of our refactoring. We're
looking at finer-grained locking, and we'll try going back to rt_mutex plus
this patch.

On Wed, Sep 14, 2016 at 9:55 AM, Peter Zijlstra <peterz@xxxxxxxxxxxxx> wrote:
> On Wed, Sep 14, 2016 at 06:13:40PM +0200, Peter Zijlstra wrote:
>> On Wed, Sep 14, 2016 at 06:11:03PM +0200, Peter Zijlstra wrote:
>> > On Tue, Sep 13, 2016 at 12:53:27PM -0700, Arve Hjønnevåg wrote:
>> > > On Tue, Sep 13, 2016 at 12:32 AM, Peter Zijlstra <peterz@xxxxxxxxxxxxx> wrote:
>> > > > cgroups should be irrelevant, PI is unaware of them.
>> > >
>> > > I don't think cgroups are irrelevant. PI being unaware of them
>> > > explains the problem I described. If the task that holds the lock is
>> > > in a cgroup that has a low cpu.shares value, then boosting the task's
>> > > priority within that group does not necessarily make it any more
>> > > likely to run.
>> >
>> > Thing is, for FIFO/DL the important parameters (prio and deadline resp.)
>> > are not cgroup dependent.
>> >
>> > For CFS you're right, and as per usual, cgroups will be a royal pain.
>> > While we can compute the total weight in the block chain, getting that
>> > back to a task which is stuck in a cgroup is just not going to work
>> > well.
>>
>> Not to mention the fact that the weight depends on the current running
>> state. Having those tasks block insta changes the actual weight.
>>
>> > /me curses @ cgroups.. bloody stupid things.
>>
>> More of that.
>
>
> Something a little like so... completely untested, and has a fair number
> of corner cases still left open I think..
>
>
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1691,8 +1691,8 @@ struct task_struct {
> 	struct seccomp seccomp;
>
> /* Thread group tracking */
> -	u32 parent_exec_id;
> -	u32 self_exec_id;
> +	u32 parent_exec_id;
> +	u32 self_exec_id;
> /* Protection of (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed,
>  * mempolicy */
> 	spinlock_t alloc_lock;
> @@ -1710,6 +1710,11 @@ struct task_struct {
> 	struct task_struct *pi_top_task;
> 	/* Deadlock detection and priority inheritance handling */
> 	struct rt_mutex_waiter *pi_blocked_on;
> +	unsigned long pi_weight;
> +#ifdef CONFIG_CGROUP_SCHED
> +	struct task_group *orig_task_group;
> +	unsigned long normal_weight;
> +#endif
> #endif
>
> #ifdef CONFIG_DEBUG_MUTEXES
> @@ -1977,6 +1982,8 @@ static inline int tsk_nr_cpus_allowed(st
> 	return p->nr_cpus_allowed;
> }
>
> +extern unsigned long task_pi_weight(struct task_struct *p);
> +
> #define TNF_MIGRATED	0x01
> #define TNF_NO_GROUP	0x02
> #define TNF_SHARED	0x04
> --- a/kernel/locking/rtmutex.c
> +++ b/kernel/locking/rtmutex.c
> @@ -20,6 +20,19 @@
> #include "rtmutex_common.h"
>
> /*
> + * !(rt_prio || dl_prio)
> + */
> +static inline bool other_prio(int prio)
> +{
> +	return prio >= MAX_RT_PRIO;
> +}
> +
> +static inline bool other_task(struct task_struct *task)
> +{
> +	return other_prio(task->prio);
> +}
> +
> +/*
>  * lock->owner state tracking:
>  *
>  * lock->owner holds the task_struct pointer of the owner. Bit 0
> @@ -226,6 +239,11 @@ rt_mutex_enqueue(struct rt_mutex *lock,
>
> 	rb_link_node(&waiter->tree_entry, parent, link);
> 	rb_insert_color(&waiter->tree_entry, &lock->waiters);
> +	/*
> +	 * We want lock->waiters_weight to reflect the sum of all queued
> +	 * waiters.
> +	 */
> +	lock->waiters_weight += waiter->weight;
> }
>
> static void
> @@ -239,6 +257,7 @@ rt_mutex_dequeue(struct rt_mutex *lock,
>
> 	rb_erase(&waiter->tree_entry, &lock->waiters);
> 	RB_CLEAR_NODE(&waiter->tree_entry);
> +	lock->waiters_weight -= waiter->weight;
> }
>
> static void
> @@ -265,6 +284,18 @@ rt_mutex_enqueue_pi(struct task_struct *
>
> 	rb_link_node(&waiter->pi_tree_entry, parent, link);
> 	rb_insert_color(&waiter->pi_tree_entry, &task->pi_waiters);
> +
> +	/*
> +	 * Since a task can own multiple locks, and we have one pi_waiter
> +	 * for every lock, propagate lock->waiters_weight, which is the sum
> +	 * of all weights queued on that lock, into the top waiter, and add
> +	 * that to the owning task's pi_weight.
> +	 *
> +	 * This results in pi_weight being the sum of all blocked waiters
> +	 * on this task.
> +	 */
> +	waiter->top_weight = waiter->lock->waiters_weight;
> +	task->pi_weight += waiter->top_weight;
> }
>
> static void
> @@ -278,6 +309,7 @@ rt_mutex_dequeue_pi(struct task_struct *
>
> 	rb_erase(&waiter->pi_tree_entry, &task->pi_waiters);
> 	RB_CLEAR_NODE(&waiter->pi_tree_entry);
> +	task->pi_weight -= waiter->top_weight;
> }
>
> static void rt_mutex_adjust_prio(struct task_struct *p)
> @@ -497,7 +529,7 @@ static int rt_mutex_adjust_prio_chain(st
> 	 * detection is enabled we continue, but stop the
> 	 * requeueing in the chain walk.
> 	 */
> -	if (top_waiter != task_top_pi_waiter(task)) {
> +	if (top_waiter != task_top_pi_waiter(task) && !other_task(task)) {
> 		if (!detect_deadlock)
> 			goto out_unlock_pi;
> 		else
> @@ -512,7 +544,7 @@ static int rt_mutex_adjust_prio_chain(st
> 	 * enabled we continue, but stop the requeueing in the chain
> 	 * walk.
> 	 */
> -	if (rt_mutex_waiter_equal(waiter, cmp_task(task))) {
> +	if (rt_mutex_waiter_equal(waiter, cmp_task(task)) && !other_task(task)) {
> 		if (!detect_deadlock)
> 			goto out_unlock_pi;
> 		else
> @@ -627,6 +659,11 @@ static int rt_mutex_adjust_prio_chain(st
> 	 */
> 	waiter->prio = task->prio;
> 	waiter->deadline = task->dl.deadline;
> +	/*
> +	 * The weight of the task depends on the block chain; since
> +	 * we're iterating that, it's likely to have changed.
> +	 */
> +	waiter->weight = task_pi_weight(task);
>
> 	rt_mutex_enqueue(lock, waiter);
>
> @@ -685,6 +722,15 @@ static int rt_mutex_adjust_prio_chain(st
> 		waiter = rt_mutex_top_waiter(lock);
> 		rt_mutex_enqueue_pi(task, waiter);
> 		rt_mutex_adjust_prio(task);
> +
> +	} else if (other_task(task)) {
> +		/*
> +		 * Propagate the lock->waiters_weight into task->pi_weight.
> +		 */
> +		rt_mutex_dequeue_pi(task, prerequeue_top_waiter);
> +		rt_mutex_enqueue_pi(task, prerequeue_top_waiter);
> +		rt_mutex_adjust_prio(task);
> +
> 	} else {
> 		/*
> 		 * Nothing changed. No need to do any priority
> @@ -728,7 +774,7 @@ static int rt_mutex_adjust_prio_chain(st
> 	 * then we can stop the chain walk here if we are not in full
> 	 * deadlock detection mode.
> 	 */
> -	if (!detect_deadlock && waiter != top_waiter)
> +	if (!detect_deadlock && waiter != top_waiter && !other_task(task))
> 		goto out_put_task;
>
> 	goto again;
> @@ -904,6 +950,7 @@ static int task_blocks_on_rt_mutex(struc
> 	waiter->lock = lock;
> 	waiter->prio = task->prio;
> 	waiter->deadline = task->dl.deadline;
> +	waiter->weight = task_pi_weight(task);
>
> 	/* Get the top priority waiter on the lock */
> 	if (rt_mutex_has_waiters(lock))
> @@ -918,7 +965,7 @@ static int task_blocks_on_rt_mutex(struc
> 		return 0;
>
> 	raw_spin_lock(&owner->pi_lock);
> -	if (waiter == rt_mutex_top_waiter(lock)) {
> +	if (other_task(owner) || waiter == rt_mutex_top_waiter(lock)) {
> 		rt_mutex_dequeue_pi(owner, top_waiter);
> 		rt_mutex_enqueue_pi(owner, waiter);
>
> @@ -1017,7 +1064,8 @@ static void mark_wakeup_next_waiter(stru
> static void remove_waiter(struct rt_mutex *lock,
> 			  struct rt_mutex_waiter *waiter)
> {
> -	bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock));
> +	struct rt_mutex_waiter *top_waiter = rt_mutex_top_waiter(lock);
> +	bool is_top_waiter = (waiter == top_waiter);
> 	struct task_struct *owner = rt_mutex_owner(lock);
> 	struct rt_mutex *next_lock;
>
> @@ -1032,12 +1080,12 @@ static void remove_waiter(struct rt_mute
> 	 * Only update priority if the waiter was the highest priority
> 	 * waiter of the lock and there is an owner to update.
> 	 */
> -	if (!owner || !is_top_waiter)
> +	if (!owner || (!other_task(owner) && !is_top_waiter))
> 		return;
>
> 	raw_spin_lock(&owner->pi_lock);
>
> -	rt_mutex_dequeue_pi(owner, waiter);
> +	rt_mutex_dequeue_pi(owner, top_waiter);
>
> 	if (rt_mutex_has_waiters(lock))
> 		rt_mutex_enqueue_pi(owner, rt_mutex_top_waiter(lock));
> --- a/kernel/locking/rtmutex_common.h
> +++ b/kernel/locking/rtmutex_common.h
> @@ -34,6 +34,7 @@ struct rt_mutex_waiter {
> #endif
> 	int prio;
> 	u64 deadline;
> +	unsigned weight, top_weight;
> };
>
> /*
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -3725,6 +3725,8 @@ void rt_mutex_setprio(struct task_struct
> 		if (rt_prio(oldprio))
> 			p->rt.timeout = 0;
> 		p->sched_class = &fair_sched_class;
> +
> +		task_set_pi_weight(p);
> 	}
>
> 	p->prio = prio;
> @@ -7933,6 +7935,12 @@ static void sched_change_group(struct ta
> 	tg = container_of(task_css_check(tsk, cpu_cgrp_id, true),
> 			  struct task_group, css);
> 	tg = autogroup_task_group(tsk, tg);
> +
> +#ifdef CONFIG_RT_MUTEXES
> +	tsk->orig_task_group = tg;
> +
> +	if (!tsk->pi_weight)
> +#endif
> 	tsk->sched_task_group = tg;
>
> #ifdef CONFIG_FAIR_GROUP_SCHED
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -6646,6 +6646,32 @@ static void attach_tasks(struct lb_env *
> 	raw_spin_unlock(&env->dst_rq->lock);
> }
>
> +#ifdef CONFIG_RT_MUTEXES
> +/*
> + * See set_load_weight().
> + */
> +static inline unsigned long __task_normal_weight(struct task_struct *p)
> +{
> +	int prio = p->static_prio - MAX_RT_PRIO;
> +
> +	/*
> +	 * SCHED_IDLE tasks get minimal weight:
> +	 */
> +	if (idle_policy(p->policy))
> +		return scale_load(WEIGHT_IDLEPRIO);
> +
> +	return scale_load(sched_prio_to_weight[prio]);
> +}
> +
> +
> +static unsigned long task_normal_weight(struct task_struct *p);
> +
> +unsigned long task_pi_weight(struct task_struct *p)
> +{
> +	return task_normal_weight(p) + p->pi_weight;
> +}
> +#endif
> +
> #ifdef CONFIG_FAIR_GROUP_SCHED
> static void update_blocked_averages(int cpu)
> {
> @@ -6717,7 +6743,80 @@ static unsigned long task_h_load(struct
> 	return div64_ul(p->se.avg.load_avg * cfs_rq->h_load,
> 			cfs_rq_load_avg(cfs_rq) + 1);
> }
> -#else
> +
> +#ifdef CONFIG_RT_MUTEXES
> +/*
> + * Humongous horrid hack.. because cgroups bloody stink.
> + *
> + * The idea with PI on CFS is to sum the weight of all blocked tasks, but
> + * with cgroups the weight of a task depends on the running state of tasks.
> + * Blocking changes the weight.
> + *
> + * Paper over that problem by using the regular averages, and hoping
> + * boosting is short enough to not actually matter much.
> + *
> + * The next problem is that getting that weight back to the boosted task
> + * requires lifting it out of its cgroup. Ideally we'd place it in the
> + * first common parent, but *shees* then we'd have to compute that, so
> + * bail and stick it in the root group.
> + */
> +
> +static unsigned long task_normal_weight(struct task_struct *p)
> +{
> +	struct cfs_rq *cfs_rq = task_cfs_rq(p);
> +
> +	/*
> +	 * Once we have ->pi_weight, we'll get boosted into the root group
> +	 * and the below falls apart.
> +	 */
> +	if (!p->pi_weight) {
> +		update_cfs_rq_h_load(cfs_rq);
> +		p->normal_weight =
> +			div64_ul(__task_normal_weight(p) * cfs_rq->h_load,
> +				 cfs_rq_load_avg(cfs_rq) + 1);
> +	}
> +
> +	return p->normal_weight;
> +}
> +
> +static void detach_task_cfs_rq(struct task_struct *p);
> +static void attach_task_cfs_rq(struct task_struct *p);
> +
> +void task_set_pi_weight(struct task_struct *p)
> +{
> +	unsigned long normal_weight = p->normal_weight;
> +	struct task_group *tg = &root_task_group;
> +	bool move_group;
> +
> +	if (!p->pi_weight) {
> +		tg = p->orig_task_group;
> +		normal_weight = __task_normal_weight(p);
> +	}
> +
> +	move_group = p->sched_task_group != tg;
> +
> +	if (move_group) {
> +		p->sched_task_group = tg;
> +
> +		detach_task_cfs_rq(p);
> +		set_task_rq(p, task_cpu(p));
> +
> +#ifdef CONFIG_SMP
> +		/* Tell se's cfs_rq has been changed -- migrated */
> +		p->se.avg.last_update_time = 0;
> +#endif
> +	}
> +
> +	update_load_set(&p->se.load, normal_weight + p->pi_weight);
> +
> +	if (move_group)
> +		attach_task_cfs_rq(p);
> +}
> +
> +#endif
> +
> +#else /* CONFIG_FAIR_GROUP_SCHED */
> +
> static inline void update_blocked_averages(int cpu)
> {
> 	struct rq *rq = cpu_rq(cpu);
> @@ -6734,8 +6833,21 @@ static unsigned long task_h_load(struct
> {
> 	return p->se.avg.load_avg;
> }
> +
> +#ifdef CONFIG_RT_MUTEXES
> +static unsigned long task_normal_weight(struct task_struct *p)
> +{
> +	return __task_normal_weight(p);
> +}
> +
> +void task_set_pi_weight(struct task_struct *p)
> +{
> +	update_load_set(&p->se.load, task_pi_weight(p));
> +}
> #endif
>
> +#endif /* CONFIG_FAIR_GROUP_SCHED */
> +
> /********** Helpers for find_busiest_group ************************/
>
> enum group_type {
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -1827,3 +1827,5 @@ static inline void cpufreq_trigger_updat
> #else /* arch_scale_freq_capacity */
> #define arch_scale_freq_invariant()	(false)
> #endif
> +
> +extern void task_set_pi_weight(struct task_struct *p);
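
To make Arve's point above concrete: under CFS a cgroup's cpu.shares caps the
combined weight of everything inside the group, so raising a task's weight
within the group does not change what the group as a whole receives. Here is a
toy model of that hierarchical scaling; effective_weight() and the numbers are
illustrative only, not kernel API (1024 and 88761 are the nice-0 and nice -20
entries of sched_prio_to_weight, and 2 is the minimum cpu.shares value).

#include <stdio.h>

/*
 * Toy model: a task's effective weight at the root is its own weight
 * scaled by its group's shares relative to the group's total load.
 */
static unsigned long effective_weight(unsigned long task_weight,
				      unsigned long group_shares,
				      unsigned long group_load)
{
	return task_weight * group_shares / group_load;
}

int main(void)
{
	/* a nice-0 task (weight 1024) alone in a group with cpu.shares=2 */
	unsigned long plain = effective_weight(1024, 2, 1024);

	/* the same task "boosted" to the nice -20 weight, same group */
	unsigned long boosted = effective_weight(88761, 2, 88761);

	/* both print 2: the boost is invisible outside the group */
	printf("plain=%lu boosted=%lu\n", plain, boosted);
	return 0;
}

Both cases come out at an effective weight of 2, which is why the patch has to
lift a boosted task out of its cgroup rather than merely raise its weight.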
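
The bookkeeping the patch adds to rt_mutex_enqueue()/rt_mutex_enqueue_pi() can
be modelled outside the kernel: each lock tracks the weight sum of its queued
waiters, and the lock owner's pi_weight accumulates those per-lock sums through
the top waiter's top_weight snapshot. The following is a minimal standalone
sketch of that invariant; the names mirror the patch's fields, but this is not
kernel code.

#include <assert.h>

struct waiter { unsigned long weight, top_weight; };
struct lock   { unsigned long waiters_weight; };
struct task   { unsigned long pi_weight; };

/* Like rt_mutex_enqueue(): the lock tracks the weight sum of queued waiters. */
static void enqueue(struct lock *l, struct waiter *w)
{
	l->waiters_weight += w->weight;
}

/* Like rt_mutex_dequeue(): the sum shrinks when a waiter leaves. */
static void dequeue(struct lock *l, struct waiter *w)
{
	l->waiters_weight -= w->weight;
}

/* Like rt_mutex_enqueue_pi(): snapshot the per-lock sum into the owner. */
static void enqueue_pi(struct task *owner, struct waiter *top, struct lock *l)
{
	top->top_weight = l->waiters_weight;
	owner->pi_weight += top->top_weight;
}

/* Like rt_mutex_dequeue_pi(): back out exactly what was added. */
static void dequeue_pi(struct task *owner, struct waiter *top)
{
	owner->pi_weight -= top->top_weight;
}

int main(void)
{
	struct lock l = { 0 };
	struct task owner = { 0 };
	struct waiter a = { .weight = 1024 }, b = { .weight = 335 };

	enqueue(&l, &a);
	enqueue(&l, &b);
	enqueue_pi(&owner, &a, &l);	/* a is the top waiter */
	assert(owner.pi_weight == 1024 + 335);

	/* b's weight changed mid-chain: requeue and refresh the snapshot,
	 * as the dequeue_pi()/enqueue_pi() pair in the chain walk does */
	dequeue_pi(&owner, &a);
	dequeue(&l, &b);
	b.weight = 88761;
	enqueue(&l, &b);
	enqueue_pi(&owner, &b, &l);	/* b is now the top waiter */
	assert(owner.pi_weight == 1024 + 88761);

	return 0;
}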
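
Similarly, the task_normal_weight() path above estimates what a blocked task's
weight is worth at the root of the hierarchy: its raw weight scaled by the
cfs_rq's h_load against the cfs_rq's load average. A worked computation with
assumed numbers (none of these values come from the patch):

#include <stdio.h>

int main(void)
{
	unsigned long task_weight = 1024;  /* __task_normal_weight(): nice-0 */
	unsigned long h_load = 512;        /* cfs_rq->h_load: group load seen at the root */
	unsigned long load_avg = 2047;     /* cfs_rq_load_avg(): load queued on this cfs_rq */

	/* the task contributes half the cfs_rq load, so it gets half of h_load */
	unsigned long normal_weight = task_weight * h_load / (load_avg + 1);

	printf("normal_weight=%lu\n", normal_weight);  /* prints 256 */
	return 0;
}

While boosted, the task then runs in the root group with weight
normal_weight + pi_weight, per task_set_pi_weight() above.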