The patch titled pi-futex: scheduler support for pi has been removed from the -mm tree. Its filename is pi-futex-scheduler-support-for-pi.patch This patch was dropped because it was merged into mainline or a subsystem tree ------------------------------------------------------ Subject: pi-futex: scheduler support for pi From: Ingo Molnar <mingo@xxxxxxx> Add framework to boost/unboost the priority of RT tasks. This consists of: - caching the 'normal' priority in ->normal_prio - providing a functions to set/get the priority of the task - make sched_setscheduler() aware of boosting The effective_prio() cleanups also fix a priority-calculation bug pointed out by Andrey Gelman, in set_user_nice(). has_rt_policy() fix: Peter Williams <pwil3058@xxxxxxxxxxxxxx> Signed-off-by: Ingo Molnar <mingo@xxxxxxx> Signed-off-by: Thomas Gleixner <tglx@xxxxxxxxxxxxx> Signed-off-by: Arjan van de Ven <arjan@xxxxxxxxxxxxxxx> Cc: Andrey Gelman <agelman@xxxxxxxxxx> Signed-off-by: Andrew Morton <akpm@xxxxxxxx> --- include/linux/init_task.h | 2 include/linux/sched.h | 21 +++- kernel/sched.c | 189 ++++++++++++++++++++++++++++++------ 3 files changed, 181 insertions(+), 31 deletions(-) diff -puN include/linux/init_task.h~pi-futex-scheduler-support-for-pi include/linux/init_task.h --- a/include/linux/init_task.h~pi-futex-scheduler-support-for-pi +++ a/include/linux/init_task.h @@ -87,6 +87,7 @@ extern struct group_info init_groups; .lock_depth = -1, \ .prio = MAX_PRIO-20, \ .static_prio = MAX_PRIO-20, \ + .normal_prio = MAX_PRIO-20, \ .policy = SCHED_NORMAL, \ .cpus_allowed = CPU_MASK_ALL, \ .mm = NULL, \ @@ -122,6 +123,7 @@ extern struct group_info init_groups; .journal_info = NULL, \ .cpu_timers = INIT_CPU_TIMERS(tsk.cpu_timers), \ .fs_excl = ATOMIC_INIT(0), \ + .pi_lock = SPIN_LOCK_UNLOCKED, \ } diff -puN include/linux/sched.h~pi-futex-scheduler-support-for-pi include/linux/sched.h --- a/include/linux/sched.h~pi-futex-scheduler-support-for-pi +++ a/include/linux/sched.h @@ -495,8 +495,11 @@ struct signal_struct { #define MAX_PRIO (MAX_RT_PRIO + 40) -#define rt_task(p) (unlikely((p)->prio < MAX_RT_PRIO)) +#define rt_prio(prio) unlikely((prio) < MAX_RT_PRIO) +#define rt_task(p) rt_prio((p)->prio) #define batch_task(p) (unlikely((p)->policy == SCHED_BATCH)) +#define has_rt_policy(p) \ + unlikely((p)->policy != SCHED_NORMAL && (p)->policy != SCHED_BATCH) /* * Some day this will be a full-fledged user tracking system.. @@ -725,7 +728,7 @@ struct task_struct { #endif #endif int load_weight; /* for niceness load balancing purposes */ - int prio, static_prio; + int prio, static_prio, normal_prio; struct list_head run_list; prio_array_t *array; @@ -852,6 +855,9 @@ struct task_struct { /* Protection of (de-)allocation: mm, files, fs, tty, keyrings */ spinlock_t alloc_lock; + /* Protection of the PI data structures: */ + spinlock_t pi_lock; + #ifdef CONFIG_DEBUG_MUTEXES /* mutex deadlock detection */ struct mutex_waiter *blocked_on; @@ -1018,6 +1024,17 @@ static inline void idle_task_exit(void) #endif extern void sched_idle_next(void); + +#ifdef CONFIG_RT_MUTEXES +extern int rt_mutex_getprio(task_t *p); +extern void rt_mutex_setprio(task_t *p, int prio); +#else +static inline int rt_mutex_getprio(task_t *p) +{ + return p->normal_prio; +} +#endif + extern void set_user_nice(task_t *p, long nice); extern int task_prio(const task_t *p); extern int task_nice(const task_t *p); diff -puN kernel/sched.c~pi-futex-scheduler-support-for-pi kernel/sched.c --- a/kernel/sched.c~pi-futex-scheduler-support-for-pi +++ a/kernel/sched.c @@ -355,6 +355,25 @@ static inline void finish_lock_switch(ru #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ /* + * __task_rq_lock - lock the runqueue a given task resides on. + * Must be called interrupts disabled. + */ +static inline runqueue_t *__task_rq_lock(task_t *p) + __acquires(rq->lock) +{ + struct runqueue *rq; + +repeat_lock_task: + rq = task_rq(p); + spin_lock(&rq->lock); + if (unlikely(rq != task_rq(p))) { + spin_unlock(&rq->lock); + goto repeat_lock_task; + } + return rq; +} + +/* * task_rq_lock - lock the runqueue a given task resides on and disable * interrupts. Note the ordering: we can safely lookup the task_rq without * explicitly disabling preemption. @@ -375,6 +394,12 @@ repeat_lock_task: return rq; } +static inline void __task_rq_unlock(runqueue_t *rq) + __releases(rq->lock) +{ + spin_unlock(&rq->lock); +} + static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags) __releases(rq->lock) { @@ -638,7 +663,7 @@ static inline void enqueue_task_head(str } /* - * effective_prio - return the priority that is based on the static + * __normal_prio - return the priority that is based on the static * priority but is modified by bonuses/penalties. * * We scale the actual sleep average [0 .... MAX_SLEEP_AVG] @@ -651,13 +676,11 @@ static inline void enqueue_task_head(str * * Both properties are important to certain workloads. */ -static int effective_prio(task_t *p) + +static inline int __normal_prio(task_t *p) { int bonus, prio; - if (rt_task(p)) - return p->prio; - bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; prio = p->static_prio - bonus; @@ -692,7 +715,7 @@ static int effective_prio(task_t *p) static void set_load_weight(task_t *p) { - if (rt_task(p)) { + if (has_rt_policy(p)) { #ifdef CONFIG_SMP if (p == task_rq(p)->migration_thread) /* @@ -731,6 +754,44 @@ static inline void dec_nr_running(task_t } /* + * Calculate the expected normal priority: i.e. priority + * without taking RT-inheritance into account. Might be + * boosted by interactivity modifiers. Changes upon fork, + * setprio syscalls, and whenever the interactivity + * estimator recalculates. + */ +static inline int normal_prio(task_t *p) +{ + int prio; + + if (has_rt_policy(p)) + prio = MAX_RT_PRIO-1 - p->rt_priority; + else + prio = __normal_prio(p); + return prio; +} + +/* + * Calculate the current priority, i.e. the priority + * taken into account by the scheduler. This value might + * be boosted by RT tasks, or might be boosted by + * interactivity modifiers. Will be RT if the task got + * RT-boosted. If not then it returns p->normal_prio. + */ +static int effective_prio(task_t *p) +{ + p->normal_prio = normal_prio(p); + /* + * If we are RT tasks or we were boosted to RT priority, + * keep the priority unchanged. Otherwise, update priority + * to the normal priority: + */ + if (!rt_prio(p->prio)) + return p->normal_prio; + return p->prio; +} + +/* * __activate_task - move a task to the runqueue. */ static void __activate_task(task_t *p, runqueue_t *rq) @@ -752,6 +813,10 @@ static inline void __activate_idle_task( inc_nr_running(p, rq); } +/* + * Recalculate p->normal_prio and p->prio after having slept, + * updating the sleep-average too: + */ static int recalc_task_prio(task_t *p, unsigned long long now) { /* Caller must always ensure 'now >= p->timestamp' */ @@ -1448,6 +1513,12 @@ void fastcall sched_fork(task_t *p, int * event cannot wake it up and insert it on the runqueue either. */ p->state = TASK_RUNNING; + + /* + * Make sure we do not leak PI boosting priority to the child: + */ + p->prio = current->normal_prio; + INIT_LIST_HEAD(&p->run_list); p->array = NULL; #ifdef CONFIG_SCHEDSTATS @@ -1527,6 +1598,7 @@ void fastcall wake_up_new_task(task_t *p __activate_task(p, rq); else { p->prio = current->prio; + p->normal_prio = current->normal_prio; list_add_tail(&p->run_list, ¤t->run_list); p->array = current->array; p->array->nr_active++; @@ -3668,12 +3740,65 @@ long fastcall __sched sleep_on_timeout(w EXPORT_SYMBOL(sleep_on_timeout); +#ifdef CONFIG_RT_MUTEXES + +/* + * rt_mutex_setprio - set the current priority of a task + * @p: task + * @prio: prio value (kernel-internal form) + * + * This function changes the 'effective' priority of a task. It does + * not touch ->normal_prio like __setscheduler(). + * + * Used by the rt_mutex code to implement priority inheritance logic. + */ +void rt_mutex_setprio(task_t *p, int prio) +{ + unsigned long flags; + prio_array_t *array; + runqueue_t *rq; + int oldprio; + + BUG_ON(prio < 0 || prio > MAX_PRIO); + + rq = task_rq_lock(p, &flags); + + oldprio = p->prio; + array = p->array; + if (array) + dequeue_task(p, array); + p->prio = prio; + + if (array) { + /* + * If changing to an RT priority then queue it + * in the active array! + */ + if (rt_task(p)) + array = rq->active; + enqueue_task(p, array); + /* + * Reschedule if we are currently running on this runqueue and + * our priority decreased, or if we are not currently running on + * this runqueue and our priority is higher than the current's + */ + if (task_running(rq, p)) { + if (p->prio > oldprio) + resched_task(rq->curr); + } else if (TASK_PREEMPTS_CURR(p, rq)) + resched_task(rq->curr); + } + task_rq_unlock(rq, &flags); +} + +#endif + void set_user_nice(task_t *p, long nice) { unsigned long flags; prio_array_t *array; runqueue_t *rq; - int old_prio, new_prio, delta; + int old_prio, delta; if (TASK_NICE(p) == nice || nice < -20 || nice > 19) return; @@ -3688,7 +3813,7 @@ void set_user_nice(task_t *p, long nice) * it wont have any effect on scheduling until the task is * not SCHED_NORMAL/SCHED_BATCH: */ - if (rt_task(p)) { + if (has_rt_policy(p)) { p->static_prio = NICE_TO_PRIO(nice); goto out_unlock; } @@ -3698,12 +3823,11 @@ void set_user_nice(task_t *p, long nice) dec_raw_weighted_load(rq, p); } - old_prio = p->prio; - new_prio = NICE_TO_PRIO(nice); - delta = new_prio - old_prio; p->static_prio = NICE_TO_PRIO(nice); set_load_weight(p); - p->prio += delta; + old_prio = p->prio; + p->prio = effective_prio(p); + delta = p->prio - old_prio; if (array) { enqueue_task(p, array); @@ -3718,7 +3842,6 @@ void set_user_nice(task_t *p, long nice) out_unlock: task_rq_unlock(rq, &flags); } - EXPORT_SYMBOL(set_user_nice); /* @@ -3833,16 +3956,14 @@ static void __setscheduler(struct task_s BUG_ON(p->array); p->policy = policy; p->rt_priority = prio; - if (policy != SCHED_NORMAL && policy != SCHED_BATCH) { - p->prio = MAX_RT_PRIO-1 - p->rt_priority; - } else { - p->prio = p->static_prio; - /* - * SCHED_BATCH tasks are treated as perpetual CPU hogs: - */ - if (policy == SCHED_BATCH) - p->sleep_avg = 0; - } + p->normal_prio = normal_prio(p); + /* we are holding p->pi_lock already */ + p->prio = rt_mutex_getprio(p); + /* + * SCHED_BATCH tasks are treated as perpetual CPU hogs: + */ + if (policy == SCHED_BATCH) + p->sleep_avg = 0; set_load_weight(p); } @@ -3912,14 +4033,20 @@ recheck: if (retval) return retval; /* + * make sure no PI-waiters arrive (or leave) while we are + * changing the priority of the task: + */ + spin_lock_irqsave(&p->pi_lock, flags); + /* * To be able to change p->policy safely, the apropriate * runqueue lock must be held. */ - rq = task_rq_lock(p, &flags); + rq = __task_rq_lock(p); /* recheck policy now with rq lock held */ if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { policy = oldpolicy = -1; - task_rq_unlock(rq, &flags); + __task_rq_unlock(rq); + spin_unlock_irqrestore(&p->pi_lock, flags); goto recheck; } array = p->array; @@ -3940,7 +4067,9 @@ recheck: } else if (TASK_PREEMPTS_CURR(p, rq)) resched_task(rq->curr); } - task_rq_unlock(rq, &flags); + __task_rq_unlock(rq); + spin_unlock_irqrestore(&p->pi_lock, flags); + return 0; } EXPORT_SYMBOL_GPL(sched_setscheduler); @@ -4575,7 +4704,7 @@ void __devinit init_idle(task_t *idle, i idle->timestamp = sched_clock(); idle->sleep_avg = 0; idle->array = NULL; - idle->prio = MAX_PRIO; + idle->prio = idle->normal_prio = MAX_PRIO; idle->state = TASK_RUNNING; idle->cpus_allowed = cpumask_of_cpu(cpu); set_task_cpu(idle, cpu); @@ -6582,7 +6711,8 @@ void normalize_rt_tasks(void) if (!rt_task(p)) continue; - rq = task_rq_lock(p, &flags); + spin_lock_irqsave(&p->pi_lock, flags); + rq = __task_rq_lock(p); array = p->array; if (array) @@ -6593,7 +6723,8 @@ void normalize_rt_tasks(void) resched_task(rq->curr); } - task_rq_unlock(rq, &flags); + __task_rq_unlock(rq); + spin_unlock_irqrestore(&p->pi_lock, flags); } read_unlock_irq(&tasklist_lock); } _ Patches currently in -mm which might be from mingo@xxxxxxx are origin.patch disable-debugging-version-of-write_lock.patch cpu_relax-use-in-acpi-lock-fix.patch acpi_srat-needs-acpi.patch lock-validator-fix-ns83820c-irq-flags-bug.patch revert-gregkh-pci-pci-test-that-drivers-properly-call-pci_set_master.patch x86-do_irq-check-irq-number.patch sched-clean-up-fallout-of-recent-changes.patch sched-cleanup-remove-task_t-convert-to-struct-task_struct.patch sched-cleanup-convert-schedc-internal-typedefs-to-struct.patch sched-fix-bug-in-__migrate_task.patch sched-add-above-background-load-function.patch mm-implement-swap-prefetching.patch sched-cleanup-remove-task_t-convert-to-struct-task_struct-prefetch.patch genirq-rename-desc-handler-to-desc-chip.patch genirq-rename-desc-handler-to-desc-chip-power-fix.patch genirq-rename-desc-handler-to-desc-chip-ia64-fix.patch genirq-rename-desc-handler-to-desc-chip-ia64-fix-2.patch genirq-rename-desc-handler-to-desc-chip-terminate_irqs-fix.patch genirq-sem2mutex-probe_sem-probing_active.patch genirq-cleanup-merge-irq_affinity-into-irq_desc.patch genirq-cleanup-remove-irq_descp.patch genirq-cleanup-remove-irq_descp-fix.patch genirq-cleanup-remove-fastcall.patch genirq-cleanup-misc-code-cleanups.patch genirq-cleanup-reduce-irq_desc_t-use-mark-it-obsolete.patch genirq-cleanup-include-linux-irqh.patch genirq-cleanup-merge-irq_dir-smp_affinity_entry-into-irq_desc.patch genirq-cleanup-merge-pending_irq_cpumask-into-irq_desc.patch genirq-cleanup-turn-arch_has_irq_per_cpu-into-config_irq_per_cpu.patch genirq-debug-better-debug-printout-in-enable_irq.patch genirq-add-retrigger-irq-op-to-consolidate-hw_irq_resend.patch genirq-doc-comment-include-linux-irqh-structures.patch genirq-doc-handle_irq_event-and-__do_irq-comments.patch genirq-cleanup-no_irq_type-cleanups.patch genirq-doc-add-design-documentation.patch genirq-add-genirq-sw-irq-retrigger.patch genirq-add-irq_noprobe-support.patch genirq-add-irq_norequest-support.patch genirq-add-irq_noautoen-support.patch genirq-update-copyrights.patch genirq-core.patch genirq-core-revert-noisiness-on-spurious-interrupts.patch genirq-msi-fixes-2.patch genirq-add-irq-chip-support.patch genirq-add-irq-chip-support-fix.patch genirq-add-irq-chip-support-misroute-irq-dont-call-desc-chip-end.patch genirq-add-handle_bad_irq.patch genirq-add-irq-wake-power-management-support.patch genirq-add-sa_trigger-support.patch genirq-cleanup-no_irq_type-no_irq_chip-rename.patch genirq-more-verbose-debugging-on-unexpected-irq-vectors.patch genirq-ia64-build-fix.patch genirq-add-irq_type_sense_mask.patch genirq-add-irq-chip-support-fasteoi-handler-handle-interrupt-disabling.patch genirq-irq-document-what-an-irq-is.patch genirq-add-chip-eoi-fastack-fasteoi-core.patch genirq-add-chip-eoi-fastack-fasteoi-fix.patch lockdep-floppyc-irq-release-fix.patch lockdep-acpi-locking-fix.patch lockdep-console_init-after-local_irq_enable.patch lockdep-add-is_module_address.patch lockdep-add-print_ip_sym.patch lockdep-add-per_cpu_offset.patch lockdep-add-disable-enable_irq_lockdep-api.patch lockdep-add-local_irq_enable_in_hardirq-api.patch lockdep-add-declare_completion_onstack-api.patch lockdep-clean-up-rwsems.patch lockdep-remove-rwsem_debug-remnants.patch lockdep-rename-debug_warn_on.patch lockdep-remove-debug_bug_on.patch lockdep-remove-mutex-deadlock-checking-code.patch lockdep-better-lock-debugging.patch lockdep-mutex-section-binutils-workaround.patch lockdep-locking-init-debugging-improvement.patch lockdep-beautify-x86_64-stacktraces.patch lockdep-x86_64-document-stack-frame-internals.patch lockdep-i386-remove-multi-entry-backtraces.patch lockdep-stacktrace-subsystem-core.patch lockdep-s390-config_frame_pointer-support.patch lockdep-stacktrace-subsystem-i386-support.patch lockdep-stacktrace-subsystem-x86_64-support.patch lockdep-stacktrace-subsystem-s390-support.patch lockdep-irqtrace-subsystem-core.patch lockdep-irqtrace-subsystem-docs.patch lockdep-irqtrace-subsystem-i386-support.patch lockdep-irqtrace-cleanup-of-include-asm-i386-irqflagsh.patch lockdep-irqtrace-subsystem-x86_64-support.patch lockdep-irqtrace-subsystem-x86_64-support-fix.patch lockdep-irqtrace-subsystem-x86_64-support-fix-2.patch lockdep-irqtrace-cleanup-of-include-asm-x86_64-irqflagsh.patch lockdep-irqtrace-subsystem-s390-support.patch lockdep-locking-api-self-tests.patch lockdep-core.patch lockdep-design-docs.patch lockdep-procfs.patch lockdep-prove-rwsem-locking-correctness.patch lockdep-prove-spinlock-rwlock-locking-correctness.patch lockdep-prove-mutex-locking-correctness.patch lockdep-kconfig.patch lockdep-print-all-lock-classes-on-sysrq-d.patch lockdep-x86_64-early-init.patch lockdep-x86-smp-alternatives-workaround.patch lockdep-do-not-recurse-in-printk.patch lockdep-fix-rt_hash_lock_sz.patch lockdep-s390-turn-validator-off-in-machine-check-handler.patch lockdep-enable-on-i386.patch lockdep-enable-on-x86_64.patch lockdep-enable-on-s390.patch lockdep-annotate-direct-io.patch lockdep-annotate-serial.patch lockdep-annotate-dcache.patch lockdep-annotate-i_mutex.patch lockdep-annotate-futex.patch lockdep-annotate-genirq.patch lockdep-annotate-waitqueues.patch lockdep-annotate-mm.patch lockdep-annotate-serio.patch lockdep-annotate-skb_queue_head_init.patch lockdep-annotate-timer-base-locks.patch lockdep-annotate-scheduler-runqueue-locks.patch lockdep-annotate-hrtimer-base-locks.patch lockdep-annotate-sock_lock_init.patch lockdep-annotate-af_unix-locking.patch lockdep-annotate-bh_lock_sock.patch lockdep-annotate-ieee1394-skb-queue-head-locking.patch lockdep-annotate-mmap_sem.patch lockdep-annotate-sunrpc-code.patch lockdep-annotate-ntfs-locking-rules.patch lockdep-annotate-the-quota-code.patch lockdep-annotate-usbfs.patch lockdep-annotate-sound-core-seq-seq_portsc.patch lockdep-annotate-sound-core-seq-seq_devicec.patch lockdep-annotate-8390c-disable_irq.patch lockdep-annotate-3c59xc-disable_irq.patch lockdep-annotate-forcedethc-disable_irq.patch lockdep-annotate-enable_in_hardirq.patch lockdep-annotate-on-stack-completions.patch lockdep-annotate-qeth-driver.patch lockdep-annotate-s_lock.patch lockdep-annotate-sb-s_umount.patch lockdep-annotate-slab-code.patch lockdep-annotate-blkdev-nesting.patch lockdep-annotate-vlan-net-device-as-being-a-special-class.patch lockdep-annotate-vlan-net-device-as-being-a-special-class-fix.patch detect-atomic-counter-underflows.patch debug-shared-irqs.patch make-frame_pointer-default=y.patch mutex-subsystem-synchro-test-module.patch vdso-print-fatal-signals.patch vdso-improve-print_fatal_signals-support-by-adding-memory-maps.patch - To unsubscribe from this list: send the line "unsubscribe mm-commits" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html