The patch titled PI-futex: scheduler support for PI has been removed from the -mm tree. Its filename is pi-futex-scheduler-support-for-pi.patch This patch was probably dropped from -mm because it has now been merged into a subsystem tree or into Linus's tree, or because it was folded into its parent patch in the -mm tree. ------------------------------------------------------ Subject: PI-futex: scheduler support for PI From: Ingo Molnar <mingo@xxxxxxx> Add framework to boost/unboost the priority of RT tasks. This consists of: - caching the 'normal' priority in ->normal_prio - providing functions to set/get the priority of the task - making sched_setscheduler() aware of boosting Signed-off-by: Ingo Molnar <mingo@xxxxxxx> Signed-off-by: Thomas Gleixner <tglx@xxxxxxxxxxxxx> Signed-off-by: Arjan van de Ven <arjan@xxxxxxxxxxxxxxx> Signed-off-by: Andrew Morton <akpm@xxxxxxxx> --- include/linux/init_task.h | 1 include/linux/sched.h | 19 ++++- kernel/sched.c | 136 +++++++++++++++++++++++++++++++----- 3 files changed, 138 insertions(+), 18 deletions(-) diff -puN include/linux/init_task.h~pi-futex-scheduler-support-for-pi include/linux/init_task.h --- devel/include/linux/init_task.h~pi-futex-scheduler-support-for-pi 2006-05-19 16:01:32.000000000 -0700 +++ devel-akpm/include/linux/init_task.h 2006-05-19 16:01:32.000000000 -0700 @@ -87,6 +87,7 @@ extern struct group_info init_groups; .lock_depth = -1, \ .prio = MAX_PRIO-20, \ .static_prio = MAX_PRIO-20, \ + .normal_prio = MAX_PRIO-20, \ .policy = SCHED_NORMAL, \ .cpus_allowed = CPU_MASK_ALL, \ .mm = NULL, \ diff -puN include/linux/sched.h~pi-futex-scheduler-support-for-pi include/linux/sched.h --- devel/include/linux/sched.h~pi-futex-scheduler-support-for-pi 2006-05-19 16:01:32.000000000 -0700 +++ devel-akpm/include/linux/sched.h 2006-05-19 16:01:32.000000000 -0700 @@ -484,7 +484,8 @@ struct signal_struct { #define MAX_PRIO (MAX_RT_PRIO + 40) -#define rt_task(p) (unlikely((p)->prio < MAX_RT_PRIO)) +#define rt_prio(prio) 
unlikely((prio) < MAX_RT_PRIO) +#define rt_task(p) rt_prio((p)->prio) #define batch_task(p) (unlikely((p)->policy == SCHED_BATCH)) /* @@ -769,7 +770,7 @@ struct task_struct { #endif #endif int load_weight; /* for niceness load balancing purposes */ - int prio, static_prio; + int prio, static_prio, normal_prio; struct list_head run_list; prio_array_t *array; @@ -898,6 +899,9 @@ struct task_struct { /* Protection of proc_dentry: nesting proc_lock, dcache_lock, write_lock_irq(&tasklist_lock); */ spinlock_t proc_lock; + /* Protection of the PI data structures: */ + spinlock_t pi_lock; + #ifdef CONFIG_DEBUG_MUTEXES /* mutex deadlock detection */ struct mutex_waiter *blocked_on; @@ -1069,6 +1073,17 @@ static inline void idle_task_exit(void) #endif extern void sched_idle_next(void); + +#ifdef CONFIG_RT_MUTEXES +extern int rt_mutex_getprio(task_t *p); +extern void rt_mutex_setprio(task_t *p, int prio); +#else +static inline int rt_mutex_getprio(task_t *p) +{ + return p->normal_prio; +} +#endif + extern void set_user_nice(task_t *p, long nice); extern int task_prio(const task_t *p); extern int task_nice(const task_t *p); diff -puN kernel/sched.c~pi-futex-scheduler-support-for-pi kernel/sched.c --- devel/kernel/sched.c~pi-futex-scheduler-support-for-pi 2006-05-19 16:01:32.000000000 -0700 +++ devel-akpm/kernel/sched.c 2006-05-19 16:01:32.000000000 -0700 @@ -666,7 +666,7 @@ static inline void enqueue_task_head(str } /* - * effective_prio - return the priority that is based on the static + * __normal_prio - return the priority that is based on the static * priority but is modified by bonuses/penalties. * * We scale the actual sleep average [0 .... MAX_SLEEP_AVG] @@ -679,13 +679,11 @@ static inline void enqueue_task_head(str * * Both properties are important to certain workloads. 
*/ -static int effective_prio(task_t *p) + +static inline int __normal_prio(task_t *p) { int bonus, prio; - if (rt_task(p)) - return p->prio; - bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; prio = p->static_prio - bonus; @@ -801,6 +799,44 @@ static inline void dec_nr_running(task_t } /* + * Calculate the expected normal priority: i.e. priority + * without taking RT-inheritance into account. Might be + * boosted by interactivity modifiers. Changes upon fork, + * setprio syscalls, and whenever the interactivity + * estimator recalculates. + */ +static inline int normal_prio(task_t *p) +{ + int prio; + + if (p->policy != SCHED_NORMAL && p->policy != SCHED_BATCH) + prio = MAX_RT_PRIO-1 - p->rt_priority; + else + prio = __normal_prio(p); + return prio; +} + +/* + * Calculate the current priority, i.e. the priority + * taken into account by the scheduler. This value might + * be boosted by RT tasks, or might be boosted by + * interactivity modifiers. Will be RT if the task got + * RT-boosted. If not then it returns p->normal_prio. + */ +static int effective_prio(task_t *p) +{ + p->normal_prio = normal_prio(p); + /* + * If we are RT tasks or we were boosted to RT priority, + * keep the priority unchanged. Otherwise, update priority + * to the normal priority: + */ + if (!rt_prio(p->prio)) + return p->normal_prio; + return p->prio; +} + +/* * __activate_task - move a task to the runqueue. 
*/ static void __activate_task(task_t *p, runqueue_t *rq) @@ -822,6 +858,10 @@ static inline void __activate_idle_task( inc_nr_running(p, rq); } +/* + * Recalculate p->normal_prio and p->prio after having slept, + * updating the sleep-average too: + */ static int recalc_task_prio(task_t *p, unsigned long long now) { /* Caller must always ensure 'now >= p->timestamp' */ @@ -1587,6 +1627,7 @@ void fastcall wake_up_new_task(task_t *p __activate_task(p, rq); else { p->prio = current->prio; + p->normal_prio = current->normal_prio; list_add_tail(&p->run_list, &current->run_list); p->array = current->array; p->array->nr_active++; @@ -3700,6 +3741,59 @@ long fastcall __sched sleep_on_timeout(w EXPORT_SYMBOL(sleep_on_timeout); +#ifdef CONFIG_RT_MUTEXES + +/* + * rt_mutex_setprio - set the current priority of a task + * @p: task + * @prio: prio value (kernel-internal form) + * + * This function changes the 'effective' priority of a task. It does + * not touch ->normal_prio like __setscheduler(). + * + * Used by the rt_mutex code to implement priority inheritance logic. + */ +void rt_mutex_setprio(task_t *p, int prio) +{ + unsigned long flags; + prio_array_t *array; + runqueue_t *rq; + int oldprio; + + BUG_ON(prio < 0 || prio > MAX_PRIO); + + rq = task_rq_lock(p, &flags); + + oldprio = p->prio; + array = p->array; + if (array) + dequeue_task(p, array); + p->prio = prio; + + if (array) { + /* + * If changing to an RT priority then queue it + * in the active array! 
+ */ + if (rt_task(p)) + array = rq->active; + enqueue_task(p, array); + /* + * Reschedule if we are currently running on this runqueue and + * our priority decreased, or if we are not currently running on + * this runqueue and our priority is higher than the current's + */ + if (task_running(rq, p)) { + if (p->prio > oldprio) + resched_task(rq->curr); + } else if (TASK_PREEMPTS_CURR(p, rq)) + resched_task(rq->curr); + } + task_rq_unlock(rq, &flags); +} + +#endif + void set_user_nice(task_t *p, long nice) { unsigned long flags; @@ -3865,16 +3959,16 @@ static void __setscheduler(struct task_s BUG_ON(p->array); p->policy = policy; p->rt_priority = prio; - if (policy != SCHED_NORMAL && policy != SCHED_BATCH) { - p->prio = MAX_RT_PRIO-1 - p->rt_priority; - } else { - p->prio = p->static_prio; - /* - * SCHED_BATCH tasks are treated as perpetual CPU hogs: - */ - if (policy == SCHED_BATCH) - p->sleep_avg = 0; - } + + p->normal_prio = normal_prio(p); + /* we are holding p->pi_lock already */ + p->prio = rt_mutex_getprio(p); + /* + * SCHED_BATCH tasks are treated as perpetual CPU hogs: + */ + if (policy == SCHED_BATCH) + p->sleep_avg = 0; + set_load_weight(p); } @@ -3942,6 +4036,11 @@ recheck: if (retval) return retval; /* + * make sure no PI-waiters arrive (or leave) while we are + * changing the priority of the task: + */ + spin_lock(&p->pi_lock); + /* * To be able to change p->policy safely, the apropriate * runqueue lock must be held. 
*/ @@ -3950,6 +4049,7 @@ recheck: if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { policy = oldpolicy = -1; task_rq_unlock(rq, &flags); + spin_unlock(&p->pi_lock); goto recheck; } array = p->array; @@ -3971,6 +4071,8 @@ recheck: resched_task(rq->curr); } task_rq_unlock(rq, &flags); + spin_unlock(&p->pi_lock); + return 0; } EXPORT_SYMBOL_GPL(sched_setscheduler); @@ -4602,7 +4704,7 @@ void __devinit init_idle(task_t *idle, i idle->timestamp = sched_clock(); idle->sleep_avg = 0; idle->array = NULL; - idle->prio = MAX_PRIO; + idle->prio = idle->normal_prio = MAX_PRIO; idle->state = TASK_RUNNING; idle->cpus_allowed = cpumask_of_cpu(cpu); set_task_cpu(idle, cpu); @@ -6602,6 +6704,7 @@ void normalize_rt_tasks(void) if (!rt_task(p)) continue; + spin_lock(&p->pi_lock); rq = task_rq_lock(p, &flags); array = p->array; @@ -6614,6 +6717,7 @@ void normalize_rt_tasks(void) } task_rq_unlock(rq, &flags); + spin_unlock(&p->pi_lock); } read_unlock_irq(&tasklist_lock); } _ Patches currently in -mm which might be from mingo@xxxxxxx are git-acpi.patch fix-drivers-mfd-ucb1x00-corec-irq-probing-bug.patch git-infiniband.patch git-netdev-all.patch fix-for-serial-uart-lockup.patch swapless-pm-add-r-w-migration-entries-fix.patch i386-break-out-of-recursion-in-stackframe-walk.patch x86-re-enable-generic-numa.patch vdso-randomize-the-i386-vdso-by-moving-it-into-a-vma.patch vdso-randomize-the-i386-vdso-by-moving-it-into-a-vma-tidy.patch vdso-randomize-the-i386-vdso-by-moving-it-into-a-vma-arch_vma_name-fix.patch work-around-ppc64-bootup-bug-by-making-mutex-debugging-save-restore-irqs.patch kernel-kernel-cpuc-to-mutexes.patch cond-resched-might-sleep-fix.patch define-__raw_get_cpu_var-and-use-it.patch ide-cd-end-of-media-error-fix.patch spin-rwlock-init-cleanups.patch time-clocksource-infrastructure.patch sched-comment-bitmap-size-accounting.patch sched-fix-interactive-ceiling-code.patch sched-implement-smpnice.patch sched-protect-calculation-of-max_pull-from-integer-wrap.patch 
sched-store-weighted-load-on-up.patch sched-add-discrete-weighted-cpu-load-function.patch sched-prevent-high-load-weight-tasks-suppressing-balancing.patch sched-improve-stability-of-smpnice-load-balancing.patch sched-improve-smpnice-load-balancing-when-load-per-task.patch smpnice-dont-consider-sched-groups-which-are-lightly-loaded-for-balancing.patch smpnice-dont-consider-sched-groups-which-are-lightly-loaded-for-balancing-fix.patch sched-modify-move_tasks-to-improve-load-balancing-outcomes.patch sched-avoid-unnecessarily-moving-highest-priority-task-move_tasks.patch sched-avoid-unnecessarily-moving-highest-priority-task-move_tasks-fix-2.patch sched_domain-handle-kmalloc-failure.patch sched_domain-handle-kmalloc-failure-fix.patch sched_domain-dont-use-gfp_atomic.patch sched_domain-use-kmalloc_node.patch sched_domain-allocate-sched_group-structures-dynamically.patch sched-add-above-background-load-function.patch mm-implement-swap-prefetching-fix.patch pi-futex-scheduler-support-for-pi.patch pi-futex-rt-mutex-core.patch pi-futex-rt-mutex-core-fix-timeout-race.patch pi-futex-rt-mutex-docs.patch pi-futex-rt-mutex-debug.patch pi-futex-rt-mutex-tester.patch pi-futex-rt-mutex-futex-api.patch pi-futex-futex_lock_pi-futex_unlock_pi-support.patch pi-futex-v2.patch pi-futex-v3.patch pi-futex-patchset-v4.patch pi-futex-patchset-v4-update.patch pi-futex-patchset-v4-fix.patch rtmutex-remove-buggy-bug_on-in-pi-boosting-code.patch futex-pi-enforce-waiter-bit-when-owner-died-is-detected.patch rtmutex-debug-printk-correct-task-information.patch futex-pi-make-use-of-restart_block-when-interrupted.patch document-futex-pi-design.patch futex_requeue-optimization.patch reiser4.patch reiser4-spin-rwlock-init-cleanups.patch genirq-rename-desc-handler-to-desc-chip.patch genirq-rename-desc-handler-to-desc-chip-power-fix.patch genirq-rename-desc-handler-to-desc-chip-ia64-fix.patch genirq-rename-desc-handler-to-desc-chip-ia64-fix-2.patch genirq-sem2mutex-probe_sem-probing_active.patch 
genirq-cleanup-merge-irq_affinity-into-irq_desc.patch genirq-cleanup-remove-irq_descp.patch genirq-cleanup-remove-fastcall.patch genirq-cleanup-misc-code-cleanups.patch genirq-cleanup-reduce-irq_desc_t-use-mark-it-obsolete.patch genirq-cleanup-include-linux-irqh.patch genirq-cleanup-merge-irq_dir-smp_affinity_entry-into-irq_desc.patch genirq-cleanup-merge-pending_irq_cpumask-into-irq_desc.patch genirq-cleanup-turn-arch_has_irq_per_cpu-into-config_irq_per_cpu.patch genirq-debug-better-debug-printout-in-enable_irq.patch genirq-add-retrigger-irq-op-to-consolidate-hw_irq_resend.patch genirq-doc-comment-include-linux-irqh-structures.patch genirq-doc-handle_irq_event-and-__do_irq-comments.patch genirq-cleanup-no_irq_type-cleanups.patch genirq-doc-add-design-documentation.patch genirq-add-genirq-sw-irq-retrigger.patch genirq-add-irq_noprobe-support.patch genirq-add-irq_norequest-support.patch genirq-add-irq_noautoen-support.patch genirq-update-copyrights.patch genirq-core.patch genirq-add-irq-chip-support.patch genirq-add-handle_bad_irq.patch genirq-add-irq-wake-power-management-support.patch genirq-add-sa_trigger-support.patch genirq-cleanup-no_irq_type-no_irq_chip-rename.patch genirq-convert-the-x86_64-architecture-to-irq-chips.patch genirq-convert-the-i386-architecture-to-irq-chips.patch genirq-convert-the-i386-architecture-to-irq-chips-fix-2.patch genirq-more-verbose-debugging-on-unexpected-irq-vectors.patch detect-atomic-counter-underflows.patch debug-shared-irqs.patch make-frame_pointer-default=y.patch mutex-subsystem-synchro-test-module.patch vdso-print-fatal-signals.patch vdso-improve-print_fatal_signals-support-by-adding-memory-maps.patch - To unsubscribe from this list: send the line "unsubscribe mm-commits" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html