The patch titled pi-futex: futex_lock_pi/futex_unlock_pi support has been removed from the -mm tree. Its filename is pi-futex-futex_lock_pi-futex_unlock_pi-support.patch This patch was dropped because it was merged into mainline or a subsystem tree ------------------------------------------------------ Subject: pi-futex: futex_lock_pi/futex_unlock_pi support From: Ingo Molnar <mingo@xxxxxxx> This adds the actual pi-futex implementation, based on rt-mutexes. [dino@xxxxxxxxxx: fix an oops-causing race] Signed-off-by: Ingo Molnar <mingo@xxxxxxx> Signed-off-by: Thomas Gleixner <tglx@xxxxxxxxxxxxx> Signed-off-by: Arjan van de Ven <arjan@xxxxxxxxxxxxxxx> Signed-off-by: Dinakar Guniguntala <dino@xxxxxxxxxx> Signed-off-by: Andrew Morton <akpm@xxxxxxxx> --- include/linux/futex.h | 7 include/linux/sched.h | 3 kernel/exit.c | 8 kernel/fork.c | 3 kernel/futex.c | 829 ++++++++++++++++++++++++++++++++++++-- kernel/futex_compat.c | 11 kernel/rtmutex_common.h | 8 7 files changed, 828 insertions(+), 41 deletions(-) diff -puN include/linux/futex.h~pi-futex-futex_lock_pi-futex_unlock_pi-support include/linux/futex.h --- a/include/linux/futex.h~pi-futex-futex_lock_pi-futex_unlock_pi-support +++ a/include/linux/futex.h @@ -12,6 +12,9 @@ #define FUTEX_REQUEUE 3 #define FUTEX_CMP_REQUEUE 4 #define FUTEX_WAKE_OP 5 +#define FUTEX_LOCK_PI 6 +#define FUTEX_UNLOCK_PI 7 +#define FUTEX_TRYLOCK_PI 8 /* * Support for robust futexes: the kernel cleans up held futexes at @@ -97,10 +100,14 @@ extern int handle_futex_death(u32 __user #ifdef CONFIG_FUTEX extern void exit_robust_list(struct task_struct *curr); +extern void exit_pi_state_list(struct task_struct *curr); #else static inline void exit_robust_list(struct task_struct *curr) { } +static inline void exit_pi_state_list(struct task_struct *curr) +{ +} #endif #define FUTEX_OP_SET 0 /* *(int *)UADDR2 = OPARG; */ diff -puN include/linux/sched.h~pi-futex-futex_lock_pi-futex_unlock_pi-support include/linux/sched.h --- a/include/linux/sched.h~pi-futex-futex_lock_pi-futex_unlock_pi-support +++ a/include/linux/sched.h @@ -84,6 +84,7 @@ struct sched_param { #include <asm/processor.h> struct exec_domain; +struct futex_pi_state; /* * List of flags we want to share for kernel threads, @@ -915,6 +916,8 @@ struct task_struct { #ifdef CONFIG_COMPAT struct compat_robust_list_head __user *compat_robust_list; #endif + struct list_head pi_state_list; + struct futex_pi_state *pi_state_cache; atomic_t fs_excl; /* holding fs exclusive resources */ struct rcu_head rcu; diff -puN kernel/exit.c~pi-futex-futex_lock_pi-futex_unlock_pi-support kernel/exit.c --- a/kernel/exit.c~pi-futex-futex_lock_pi-futex_unlock_pi-support +++ a/kernel/exit.c @@ -926,6 +926,14 @@ fastcall NORET_TYPE void do_exit(long co tsk->mempolicy = NULL; #endif /* + * This must happen late, after the PID is not + * hashed anymore: + */ + if (unlikely(!list_empty(&tsk->pi_state_list))) + exit_pi_state_list(tsk); + if (unlikely(current->pi_state_cache)) + kfree(current->pi_state_cache); + /* * If DEBUG_MUTEXES is on, make sure we are holding no locks: */ mutex_debug_check_no_locks_held(tsk); diff -puN kernel/fork.c~pi-futex-futex_lock_pi-futex_unlock_pi-support kernel/fork.c --- a/kernel/fork.c~pi-futex-futex_lock_pi-futex_unlock_pi-support +++ a/kernel/fork.c @@ -1092,6 +1092,9 @@ static task_t *copy_process(unsigned lon #ifdef CONFIG_COMPAT p->compat_robust_list = NULL; #endif + INIT_LIST_HEAD(&p->pi_state_list); + p->pi_state_cache = NULL; + /* * sigaltstack should be cleared when sharing the same VM */ diff -puN kernel/futex.c~pi-futex-futex_lock_pi-futex_unlock_pi-support kernel/futex.c --- a/kernel/futex.c~pi-futex-futex_lock_pi-futex_unlock_pi-support +++ a/kernel/futex.c @@ -12,6 +12,10 @@ * (C) Copyright 2006 Red Hat Inc, All Rights Reserved * Thanks to Thomas Gleixner for suggestions, analysis and fixes. * + * PI-futex support started by Ingo Molnar and Thomas Gleixner + * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@xxxxxxxxxx> + * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@xxxxxxxxxxx> + * * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly * enough at me, Linus for the original (flawed) idea, Matthew * Kirkwood for proof-of-concept implementation. @@ -46,6 +50,8 @@ #include <linux/signal.h> #include <asm/futex.h> +#include "rtmutex_common.h" + #define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) /* @@ -75,6 +81,27 @@ union futex_key { }; /* + * Priority Inheritance state: + */ +struct futex_pi_state { + /* + * list of 'owned' pi_state instances - these have to be + * cleaned up in do_exit() if the task exits prematurely: + */ + struct list_head list; + + /* + * The PI object: + */ + struct rt_mutex pi_mutex; + + struct task_struct *owner; + atomic_t refcount; + + union futex_key key; +}; + +/* * We use this hashed waitqueue instead of a normal wait_queue_t, so * we can wake only the relevant ones (hashed queues may be shared). * @@ -96,6 +123,10 @@ struct futex_q { /* For fd, sigio sent using these: */ int fd; struct file *filp; + + /* Optional priority inheritance state: */ + struct futex_pi_state *pi_state; + struct task_struct *task; }; /* @@ -259,6 +290,232 @@ static inline int get_futex_value_locked } /* + * Fault handling. Called with current->mm->mmap_sem held. + */ +static int futex_handle_fault(unsigned long address, int attempt) +{ + struct vm_area_struct * vma; + struct mm_struct *mm = current->mm; + + if (attempt >= 2 || !(vma = find_vma(mm, address)) || + vma->vm_start > address || !(vma->vm_flags & VM_WRITE)) + return -EFAULT; + + switch (handle_mm_fault(mm, vma, address, 1)) { + case VM_FAULT_MINOR: + current->min_flt++; + break; + case VM_FAULT_MAJOR: + current->maj_flt++; + break; + default: + return -EFAULT; + } + return 0; +} + +/* + * PI code: + */ +static int refill_pi_state_cache(void) +{ + struct futex_pi_state *pi_state; + + if (likely(current->pi_state_cache)) + return 0; + + pi_state = kmalloc(sizeof(*pi_state), GFP_KERNEL); + + if (!pi_state) + return -ENOMEM; + + memset(pi_state, 0, sizeof(*pi_state)); + INIT_LIST_HEAD(&pi_state->list); + /* pi_mutex gets initialized later */ + pi_state->owner = NULL; + atomic_set(&pi_state->refcount, 1); + + current->pi_state_cache = pi_state; + + return 0; +} + +static struct futex_pi_state * alloc_pi_state(void) +{ + struct futex_pi_state *pi_state = current->pi_state_cache; + + WARN_ON(!pi_state); + current->pi_state_cache = NULL; + + return pi_state; +} + +static void free_pi_state(struct futex_pi_state *pi_state) +{ + if (!atomic_dec_and_test(&pi_state->refcount)) + return; + + /* + * If pi_state->owner is NULL, the owner is most probably dying + * and has cleaned up the pi_state already + */ + if (pi_state->owner) { + spin_lock_irq(&pi_state->owner->pi_lock); + list_del_init(&pi_state->list); + spin_unlock_irq(&pi_state->owner->pi_lock); + + rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner); + } + + if (current->pi_state_cache) + kfree(pi_state); + else { + /* + * pi_state->list is already empty. + * clear pi_state->owner. + * refcount is at 0 - put it back to 1. + */ + pi_state->owner = NULL; + atomic_set(&pi_state->refcount, 1); + current->pi_state_cache = pi_state; + } +} + +/* + * Look up the task based on what TID userspace gave us. + * We dont trust it. + */ +static struct task_struct * futex_find_get_task(pid_t pid) +{ + struct task_struct *p; + + read_lock(&tasklist_lock); + p = find_task_by_pid(pid); + if (!p) + goto out_unlock; + if ((current->euid != p->euid) && (current->euid != p->uid)) { + p = NULL; + goto out_unlock; + } + if (p->state == EXIT_ZOMBIE || p->exit_state == EXIT_ZOMBIE) { + p = NULL; + goto out_unlock; + } + get_task_struct(p); +out_unlock: + read_unlock(&tasklist_lock); + + return p; +} + +/* + * This task is holding PI mutexes at exit time => bad. + * Kernel cleans up PI-state, but userspace is likely hosed. + * (Robust-futex cleanup is separate and might save the day for userspace.) + */ +void exit_pi_state_list(struct task_struct *curr) +{ + struct futex_hash_bucket *hb; + struct list_head *next, *head = &curr->pi_state_list; + struct futex_pi_state *pi_state; + union futex_key key; + + /* + * We are a ZOMBIE and nobody can enqueue itself on + * pi_state_list anymore, but we have to be careful + * versus waiters unqueueing themselfs + */ + spin_lock_irq(&curr->pi_lock); + while (!list_empty(head)) { + + next = head->next; + pi_state = list_entry(next, struct futex_pi_state, list); + key = pi_state->key; + spin_unlock_irq(&curr->pi_lock); + + hb = hash_futex(&key); + spin_lock(&hb->lock); + + spin_lock_irq(&curr->pi_lock); + if (head->next != next) { + spin_unlock(&hb->lock); + continue; + } + + list_del_init(&pi_state->list); + + WARN_ON(pi_state->owner != curr); + + pi_state->owner = NULL; + spin_unlock_irq(&curr->pi_lock); + + rt_mutex_unlock(&pi_state->pi_mutex); + + spin_unlock(&hb->lock); + + spin_lock_irq(&curr->pi_lock); + } + spin_unlock_irq(&curr->pi_lock); +} + +static int +lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me) +{ + struct futex_pi_state *pi_state = NULL; + struct futex_q *this, *next; + struct list_head *head; + struct task_struct *p; + pid_t pid; + + head = &hb->chain; + + list_for_each_entry_safe(this, next, head, list) { + if (match_futex (&this->key, &me->key)) { + /* + * Another waiter already exists - bump up + * the refcount and return its pi_state: + */ + pi_state = this->pi_state; + atomic_inc(&pi_state->refcount); + me->pi_state = pi_state; + + return 0; + } + } + + /* + * We are the first waiter - try to look up the real owner and + * attach the new pi_state to it: + */ + pid = uval & FUTEX_TID_MASK; + p = futex_find_get_task(pid); + if (!p) + return -ESRCH; + + pi_state = alloc_pi_state(); + + /* + * Initialize the pi_mutex in locked state and make 'p' + * the owner of it: + */ + rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p); + + /* Store the key for possible exit cleanups: */ + pi_state->key = me->key; + + spin_lock_irq(&p->pi_lock); + list_add(&pi_state->list, &p->pi_state_list); + pi_state->owner = p; + spin_unlock_irq(&p->pi_lock); + + put_task_struct(p); + + me->pi_state = pi_state; + + return 0; +} + +/* * The hash bucket lock must be held when this is called. * Afterwards, the futex_q must not be accessed. */ @@ -285,6 +542,70 @@ static void wake_futex(struct futex_q *q q->lock_ptr = NULL; } +static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) +{ + struct task_struct *new_owner; + struct futex_pi_state *pi_state = this->pi_state; + u32 curval, newval; + + if (!pi_state) + return -EINVAL; + + new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); + + /* + * This happens when we have stolen the lock and the original + * pending owner did not enqueue itself back on the rt_mutex. + * Thats not a tragedy. We know that way, that a lock waiter + * is on the fly. We make the futex_q waiter the pending owner. + */ + if (!new_owner) + new_owner = this->task; + + /* + * We pass it to the next owner. (The WAITERS bit is always + * kept enabled while there is PI state around. We must also + * preserve the owner died bit.) + */ + newval = (uval & FUTEX_OWNER_DIED) | FUTEX_WAITERS | new_owner->pid; + + inc_preempt_count(); + curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); + dec_preempt_count(); + + if (curval == -EFAULT) + return -EFAULT; + if (curval != uval) + return -EINVAL; + + list_del_init(&pi_state->owner->pi_state_list); + list_add(&pi_state->list, &new_owner->pi_state_list); + pi_state->owner = new_owner; + rt_mutex_unlock(&pi_state->pi_mutex); + + return 0; +} + +static int unlock_futex_pi(u32 __user *uaddr, u32 uval) +{ + u32 oldval; + + /* + * There is no waiter, so we unlock the futex. The owner died + * bit has not to be preserved here. We are the owner: + */ + inc_preempt_count(); + oldval = futex_atomic_cmpxchg_inatomic(uaddr, uval, 0); + dec_preempt_count(); + + if (oldval == -EFAULT) + return oldval; + if (oldval != uval) + return -EAGAIN; + + return 0; +} + /* * Wake up all waiters hashed on the physical page that is mapped * to this virtual address: @@ -309,6 +630,8 @@ static int futex_wake(u32 __user *uaddr, list_for_each_entry_safe(this, next, head, list) { if (match_futex (&this->key, &key)) { + if (this->pi_state) + return -EINVAL; wake_futex(this); if (++ret >= nr_wake) break; @@ -385,27 +708,9 @@ retry: * still holding the mmap_sem. */ if (attempt++) { - struct vm_area_struct * vma; - struct mm_struct *mm = current->mm; - unsigned long address = (unsigned long)uaddr2; - - ret = -EFAULT; - if (attempt >= 2 || - !(vma = find_vma(mm, address)) || - vma->vm_start > address || - !(vma->vm_flags & VM_WRITE)) + if (futex_handle_fault((unsigned long)uaddr2, + attempt)) goto out; - - switch (handle_mm_fault(mm, vma, address, 1)) { - case VM_FAULT_MINOR: - current->min_flt++; - break; - case VM_FAULT_MAJOR: - current->maj_flt++; - break; - default: - goto out; - } goto retry; } @@ -572,6 +877,7 @@ queue_lock(struct futex_q *q, int fd, st static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb) { list_add_tail(&q->list, &hb->chain); + q->task = current; spin_unlock(&hb->lock); } @@ -626,6 +932,9 @@ static int unqueue_me(struct futex_q *q) } WARN_ON(list_empty(&q->list)); list_del(&q->list); + + BUG_ON(q->pi_state); + spin_unlock(lock_ptr); ret = 1; } @@ -634,16 +943,36 @@ static int unqueue_me(struct futex_q *q) return ret; } +/* + * PI futexes can not be requeued and must remove themself from the + * hash bucket. The hash bucket lock is held on entry and dropped here. + */ +static void unqueue_me_pi(struct futex_q *q, struct futex_hash_bucket *hb) +{ + WARN_ON(list_empty(&q->list)); + list_del(&q->list); + + BUG_ON(!q->pi_state); + free_pi_state(q->pi_state); + q->pi_state = NULL; + + spin_unlock(&hb->lock); + + drop_key_refs(&q->key); +} + static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time) { - DECLARE_WAITQUEUE(wait, current); + struct task_struct *curr = current; + DECLARE_WAITQUEUE(wait, curr); struct futex_hash_bucket *hb; struct futex_q q; u32 uval; int ret; + q.pi_state = NULL; retry: - down_read(¤t->mm->mmap_sem); + down_read(&curr->mm->mmap_sem); ret = get_futex_key(uaddr, &q.key); if (unlikely(ret != 0)) @@ -680,7 +1009,7 @@ static int futex_wait(u32 __user *uaddr, * If we would have faulted, release mmap_sem, fault it in and * start all over again. */ - up_read(¤t->mm->mmap_sem); + up_read(&curr->mm->mmap_sem); ret = get_user(uval, uaddr); @@ -688,11 +1017,9 @@ static int futex_wait(u32 __user *uaddr, goto retry; return ret; } - if (uval != val) { - ret = -EWOULDBLOCK; - queue_unlock(&q, hb); - goto out_release_sem; - } + ret = -EWOULDBLOCK; + if (uval != val) + goto out_unlock_release_sem; /* Only actually queue if *uaddr contained val. */ __queue_me(&q, hb); @@ -700,8 +1027,8 @@ static int futex_wait(u32 __user *uaddr, /* * Now the futex is queued and we have checked the data, we * don't want to hold mmap_sem while we sleep. - */ - up_read(¤t->mm->mmap_sem); + */ + up_read(&curr->mm->mmap_sem); /* * There might have been scheduling since the queue_me(), as we @@ -739,8 +1066,415 @@ static int futex_wait(u32 __user *uaddr, */ return -EINTR; + out_unlock_release_sem: + queue_unlock(&q, hb); + + out_release_sem: + up_read(&curr->mm->mmap_sem); + return ret; +} + +/* + * Userspace tried a 0 -> TID atomic transition of the futex value + * and failed. The kernel side here does the whole locking operation: + * if there are waiters then it will block, it does PI, etc. (Due to + * races the kernel might see a 0 value of the futex too.) + */ +static int do_futex_lock_pi(u32 __user *uaddr, int detect, int trylock, + struct hrtimer_sleeper *to) +{ + struct task_struct *curr = current; + struct futex_hash_bucket *hb; + u32 uval, newval, curval; + struct futex_q q; + int ret, attempt = 0; + + if (refill_pi_state_cache()) + return -ENOMEM; + + q.pi_state = NULL; + retry: + down_read(&curr->mm->mmap_sem); + + ret = get_futex_key(uaddr, &q.key); + if (unlikely(ret != 0)) + goto out_release_sem; + + hb = queue_lock(&q, -1, NULL); + + retry_locked: + /* + * To avoid races, we attempt to take the lock here again + * (by doing a 0 -> TID atomic cmpxchg), while holding all + * the locks. It will most likely not succeed. + */ + newval = current->pid; + + inc_preempt_count(); + curval = futex_atomic_cmpxchg_inatomic(uaddr, 0, newval); + dec_preempt_count(); + + if (unlikely(curval == -EFAULT)) + goto uaddr_faulted; + + /* We own the lock already */ + if (unlikely((curval & FUTEX_TID_MASK) == current->pid)) { + if (!detect && 0) + force_sig(SIGKILL, current); + ret = -EDEADLK; + goto out_unlock_release_sem; + } + + /* + * Surprise - we got the lock. Just return + * to userspace: + */ + if (unlikely(!curval)) + goto out_unlock_release_sem; + + uval = curval; + newval = uval | FUTEX_WAITERS; + + inc_preempt_count(); + curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); + dec_preempt_count(); + + if (unlikely(curval == -EFAULT)) + goto uaddr_faulted; + if (unlikely(curval != uval)) + goto retry_locked; + + /* + * We dont have the lock. Look up the PI state (or create it if + * we are the first waiter): + */ + ret = lookup_pi_state(uval, hb, &q); + + if (unlikely(ret)) { + /* + * There were no waiters and the owner task lookup + * failed. When the OWNER_DIED bit is set, then we + * know that this is a robust futex and we actually + * take the lock. This is safe as we are protected by + * the hash bucket lock. We also set the waiters bit + * unconditionally here, to simplify glibc handling of + * multiple tasks racing to acquire the lock and + * cleanup the problems which were left by the dead + * owner. + */ + if (curval & FUTEX_OWNER_DIED) { + uval = newval; + newval = current->pid | + FUTEX_OWNER_DIED | FUTEX_WAITERS; + + inc_preempt_count(); + curval = futex_atomic_cmpxchg_inatomic(uaddr, + uval, newval); + dec_preempt_count(); + + if (unlikely(curval == -EFAULT)) + goto uaddr_faulted; + if (unlikely(curval != uval)) + goto retry_locked; + ret = 0; + } + goto out_unlock_release_sem; + } + + /* + * Only actually queue now that the atomic ops are done: + */ + __queue_me(&q, hb); + + /* + * Now the futex is queued and we have checked the data, we + * don't want to hold mmap_sem while we sleep. + */ + up_read(&curr->mm->mmap_sem); + + WARN_ON(!q.pi_state); + /* + * Block on the PI mutex: + */ + if (!trylock) + ret = rt_mutex_timed_lock(&q.pi_state->pi_mutex, to, 1); + else { + ret = rt_mutex_trylock(&q.pi_state->pi_mutex); + /* Fixup the trylock return value: */ + ret = ret ? 0 : -EWOULDBLOCK; + } + + down_read(&curr->mm->mmap_sem); + hb = queue_lock(&q, -1, NULL); + + /* + * Got the lock. We might not be the anticipated owner if we + * did a lock-steal - fix up the PI-state in that case. + */ + if (!ret && q.pi_state->owner != curr) { + u32 newtid = current->pid | FUTEX_WAITERS; + + /* Owner died? */ + if (q.pi_state->owner != NULL) { + spin_lock_irq(&q.pi_state->owner->pi_lock); + list_del_init(&q.pi_state->list); + spin_unlock_irq(&q.pi_state->owner->pi_lock); + } else + newtid |= FUTEX_OWNER_DIED; + + q.pi_state->owner = current; + + spin_lock_irq(¤t->pi_lock); + list_add(&q.pi_state->list, ¤t->pi_state_list); + spin_unlock_irq(¤t->pi_lock); + + /* Unqueue and drop the lock */ + unqueue_me_pi(&q, hb); + up_read(&curr->mm->mmap_sem); + /* + * We own it, so we have to replace the pending owner + * TID. This must be atomic as we have preserve the + * owner died bit here. + */ + ret = get_user(uval, uaddr); + while (!ret) { + newval = (uval & FUTEX_OWNER_DIED) | newtid; + curval = futex_atomic_cmpxchg_inatomic(uaddr, + uval, newval); + if (curval == -EFAULT) + ret = -EFAULT; + if (curval == uval) + break; + uval = curval; + } + } else { + /* + * Catch the rare case, where the lock was released + * when we were on the way back before we locked + * the hash bucket. + */ + if (ret && q.pi_state->owner == curr) { + if (rt_mutex_trylock(&q.pi_state->pi_mutex)) + ret = 0; + } + /* Unqueue and drop the lock */ + unqueue_me_pi(&q, hb); + up_read(&curr->mm->mmap_sem); + } + + if (!detect && ret == -EDEADLK && 0) + force_sig(SIGKILL, current); + + return ret; + + out_unlock_release_sem: + queue_unlock(&q, hb); + out_release_sem: + up_read(&curr->mm->mmap_sem); + return ret; + + uaddr_faulted: + /* + * We have to r/w *(int __user *)uaddr, but we can't modify it + * non-atomically. Therefore, if get_user below is not + * enough, we need to handle the fault ourselves, while + * still holding the mmap_sem. + */ + if (attempt++) { + if (futex_handle_fault((unsigned long)uaddr, attempt)) + goto out_unlock_release_sem; + + goto retry_locked; + } + + queue_unlock(&q, hb); + up_read(&curr->mm->mmap_sem); + + ret = get_user(uval, uaddr); + if (!ret && (uval != -EFAULT)) + goto retry; + + return ret; +} + +/* + * Restart handler + */ +static long futex_lock_pi_restart(struct restart_block *restart) +{ + struct hrtimer_sleeper timeout, *to = NULL; + int ret; + + restart->fn = do_no_restart_syscall; + + if (restart->arg2 || restart->arg3) { + to = &timeout; + hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_ABS); + hrtimer_init_sleeper(to, current); + to->timer.expires.tv64 = ((u64)restart->arg1 << 32) | + (u64) restart->arg0; + } + + pr_debug("lock_pi restart: %p, %d (%d)\n", + (u32 __user *)restart->arg0, current->pid); + + ret = do_futex_lock_pi((u32 __user *)restart->arg0, restart->arg1, + 0, to); + + if (ret != -EINTR) + return ret; + + restart->fn = futex_lock_pi_restart; + + /* The other values are filled in */ + return -ERESTART_RESTARTBLOCK; +} + +/* + * Called from the syscall entry below. + */ +static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec, + long nsec, int trylock) +{ + struct hrtimer_sleeper timeout, *to = NULL; + struct restart_block *restart; + int ret; + + if (sec != MAX_SCHEDULE_TIMEOUT) { + to = &timeout; + hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_ABS); + hrtimer_init_sleeper(to, current); + to->timer.expires = ktime_set(sec, nsec); + } + + ret = do_futex_lock_pi(uaddr, detect, trylock, to); + + if (ret != -EINTR) + return ret; + + pr_debug("lock_pi interrupted: %p, %d (%d)\n", uaddr, current->pid); + + restart = ¤t_thread_info()->restart_block; + restart->fn = futex_lock_pi_restart; + restart->arg0 = (unsigned long) uaddr; + restart->arg1 = detect; + if (to) { + restart->arg2 = to->timer.expires.tv64 & 0xFFFFFFFF; + restart->arg3 = to->timer.expires.tv64 >> 32; + } else + restart->arg2 = restart->arg3 = 0; + + return -ERESTART_RESTARTBLOCK; +} + +/* + * Userspace attempted a TID -> 0 atomic transition, and failed. + * This is the in-kernel slowpath: we look up the PI state (if any), + * and do the rt-mutex unlock. + */ +static int futex_unlock_pi(u32 __user *uaddr) +{ + struct futex_hash_bucket *hb; + struct futex_q *this, *next; + u32 uval; + struct list_head *head; + union futex_key key; + int ret, attempt = 0; + +retry: + if (get_user(uval, uaddr)) + return -EFAULT; + /* + * We release only a lock we actually own: + */ + if ((uval & FUTEX_TID_MASK) != current->pid) + return -EPERM; + /* + * First take all the futex related locks: + */ + down_read(¤t->mm->mmap_sem); + + ret = get_futex_key(uaddr, &key); + if (unlikely(ret != 0)) + goto out; + + hb = hash_futex(&key); + spin_lock(&hb->lock); + +retry_locked: + /* + * To avoid races, try to do the TID -> 0 atomic transition + * again. If it succeeds then we can return without waking + * anyone else up: + */ + inc_preempt_count(); + uval = futex_atomic_cmpxchg_inatomic(uaddr, current->pid, 0); + dec_preempt_count(); + + if (unlikely(uval == -EFAULT)) + goto pi_faulted; + /* + * Rare case: we managed to release the lock atomically, + * no need to wake anyone else up: + */ + if (unlikely(uval == current->pid)) + goto out_unlock; + + /* + * Ok, other tasks may need to be woken up - check waiters + * and do the wakeup if necessary: + */ + head = &hb->chain; + + list_for_each_entry_safe(this, next, head, list) { + if (!match_futex (&this->key, &key)) + continue; + ret = wake_futex_pi(uaddr, uval, this); + /* + * The atomic access to the futex value + * generated a pagefault, so retry the + * user-access and the wakeup: + */ + if (ret == -EFAULT) + goto pi_faulted; + goto out_unlock; + } + /* + * No waiters - kernel unlocks the futex: + */ + ret = unlock_futex_pi(uaddr, uval); + if (ret == -EFAULT) + goto pi_faulted; + +out_unlock: + spin_unlock(&hb->lock); +out: + up_read(¤t->mm->mmap_sem); + + return ret; + +pi_faulted: + /* + * We have to r/w *(int __user *)uaddr, but we can't modify it + * non-atomically. Therefore, if get_user below is not + * enough, we need to handle the fault ourselves, while + * still holding the mmap_sem. + */ + if (attempt++) { + if (futex_handle_fault((unsigned long)uaddr, attempt)) + goto out_unlock; + + goto retry_locked; + } + + spin_unlock(&hb->lock); up_read(¤t->mm->mmap_sem); + + ret = get_user(uval, uaddr); + if (!ret && (uval != -EFAULT)) + goto retry; + return ret; } @@ -819,6 +1553,7 @@ static int futex_fd(u32 __user *uaddr, i err = -ENOMEM; goto error; } + q->pi_state = NULL; down_read(¤t->mm->mmap_sem); err = get_futex_key(uaddr, &q->key); @@ -856,7 +1591,7 @@ error: * Implementation: user-space maintains a per-thread list of locks it * is holding. Upon do_exit(), the kernel carefully walks this list, * and marks all locks that are owned by this thread with the - * FUTEX_OWNER_DEAD bit, and wakes up a waiter (if any). The list is + * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is * always manipulated with the lock held, so the list is private and * per-thread. Userspace also maintains a per-thread 'list_op_pending' * field, to allow the kernel to clean up if the thread dies after @@ -931,7 +1666,7 @@ err_unlock: */ int handle_futex_death(u32 __user *uaddr, struct task_struct *curr) { - u32 uval; + u32 uval, nval; retry: if (get_user(uval, uaddr)) @@ -948,8 +1683,12 @@ retry: * thread-death.) The rest of the cleanup is done in * userspace. */ - if (futex_atomic_cmpxchg_inatomic(uaddr, uval, - uval | FUTEX_OWNER_DIED) != uval) + nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, + uval | FUTEX_OWNER_DIED); + if (nval == -EFAULT) + return -1; + + if (nval != uval) goto retry; if (uval & FUTEX_WAITERS) @@ -994,7 +1733,7 @@ void exit_robust_list(struct task_struct while (entry != &head->list) { /* * A pending lock might already be on the list, so - * dont process it twice: + * don't process it twice: */ if (entry != pending) if (handle_futex_death((void *)entry + futex_offset, @@ -1040,6 +1779,15 @@ long do_futex(u32 __user *uaddr, int op, case FUTEX_WAKE_OP: ret = futex_wake_op(uaddr, uaddr2, val, val2, val3); break; + case FUTEX_LOCK_PI: + ret = futex_lock_pi(uaddr, val, timeout, val2, 0); + break; + case FUTEX_UNLOCK_PI: + ret = futex_unlock_pi(uaddr); + break; + case FUTEX_TRYLOCK_PI: + ret = futex_lock_pi(uaddr, 0, timeout, val2, 1); + break; default: ret = -ENOSYS; } @@ -1055,17 +1803,22 @@ asmlinkage long sys_futex(u32 __user *ua unsigned long timeout = MAX_SCHEDULE_TIMEOUT; u32 val2 = 0; - if (utime && (op == FUTEX_WAIT)) { + if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) { if (copy_from_user(&t, utime, sizeof(t)) != 0) return -EFAULT; if (!timespec_valid(&t)) return -EINVAL; - timeout = timespec_to_jiffies(&t) + 1; + if (op == FUTEX_WAIT) + timeout = timespec_to_jiffies(&t) + 1; + else { + timeout = t.tv_sec; + val2 = t.tv_nsec; + } } /* * requeue parameter in 'utime' if op == FUTEX_REQUEUE. */ - if (op >= FUTEX_REQUEUE) + if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE) val2 = (u32) (unsigned long) utime; return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3); diff -puN kernel/futex_compat.c~pi-futex-futex_lock_pi-futex_unlock_pi-support kernel/futex_compat.c --- a/kernel/futex_compat.c~pi-futex-futex_lock_pi-futex_unlock_pi-support +++ a/kernel/futex_compat.c @@ -129,14 +129,19 @@ asmlinkage long compat_sys_futex(u32 __u unsigned long timeout = MAX_SCHEDULE_TIMEOUT; int val2 = 0; - if (utime && (op == FUTEX_WAIT)) { + if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) { if (get_compat_timespec(&t, utime)) return -EFAULT; if (!timespec_valid(&t)) return -EINVAL; - timeout = timespec_to_jiffies(&t) + 1; + if (op == FUTEX_WAIT) + timeout = timespec_to_jiffies(&t) + 1; + else { + timeout = t.tv_sec; + val2 = t.tv_nsec; + } } - if (op >= FUTEX_REQUEUE) + if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE) val2 = (int) (unsigned long) utime; return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3); diff -puN kernel/rtmutex_common.h~pi-futex-futex_lock_pi-futex_unlock_pi-support kernel/rtmutex_common.h --- a/kernel/rtmutex_common.h~pi-futex-futex_lock_pi-futex_unlock_pi-support +++ a/kernel/rtmutex_common.h @@ -112,4 +112,12 @@ static inline unsigned long rt_mutex_own return (unsigned long)lock->owner & RT_MUTEX_OWNER_PENDING; } +/* + * PI-futex support (proxy locking functions, etc.): + */ +extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock); +extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock, + struct task_struct *proxy_owner); +extern void rt_mutex_proxy_unlock(struct rt_mutex *lock, + struct task_struct *proxy_owner); #endif _ Patches currently in -mm which might be from mingo@xxxxxxx are origin.patch disable-debugging-version-of-write_lock.patch cpu_relax-use-in-acpi-lock-fix.patch acpi_srat-needs-acpi.patch lock-validator-fix-ns83820c-irq-flags-bug.patch revert-gregkh-pci-pci-test-that-drivers-properly-call-pci_set_master.patch x86-do_irq-check-irq-number.patch sched-clean-up-fallout-of-recent-changes.patch sched-cleanup-remove-task_t-convert-to-struct-task_struct.patch sched-cleanup-convert-schedc-internal-typedefs-to-struct.patch sched-fix-bug-in-__migrate_task.patch sched-add-above-background-load-function.patch mm-implement-swap-prefetching.patch sched-cleanup-remove-task_t-convert-to-struct-task_struct-prefetch.patch genirq-rename-desc-handler-to-desc-chip.patch genirq-rename-desc-handler-to-desc-chip-power-fix.patch genirq-rename-desc-handler-to-desc-chip-ia64-fix.patch genirq-rename-desc-handler-to-desc-chip-ia64-fix-2.patch genirq-rename-desc-handler-to-desc-chip-terminate_irqs-fix.patch genirq-sem2mutex-probe_sem-probing_active.patch genirq-cleanup-merge-irq_affinity-into-irq_desc.patch genirq-cleanup-remove-irq_descp.patch genirq-cleanup-remove-irq_descp-fix.patch genirq-cleanup-remove-fastcall.patch genirq-cleanup-misc-code-cleanups.patch genirq-cleanup-reduce-irq_desc_t-use-mark-it-obsolete.patch genirq-cleanup-include-linux-irqh.patch genirq-cleanup-merge-irq_dir-smp_affinity_entry-into-irq_desc.patch genirq-cleanup-merge-pending_irq_cpumask-into-irq_desc.patch genirq-cleanup-turn-arch_has_irq_per_cpu-into-config_irq_per_cpu.patch genirq-debug-better-debug-printout-in-enable_irq.patch genirq-add-retrigger-irq-op-to-consolidate-hw_irq_resend.patch genirq-doc-comment-include-linux-irqh-structures.patch genirq-doc-handle_irq_event-and-__do_irq-comments.patch genirq-cleanup-no_irq_type-cleanups.patch genirq-doc-add-design-documentation.patch genirq-add-genirq-sw-irq-retrigger.patch genirq-add-irq_noprobe-support.patch genirq-add-irq_norequest-support.patch genirq-add-irq_noautoen-support.patch genirq-update-copyrights.patch genirq-core.patch genirq-core-revert-noisiness-on-spurious-interrupts.patch genirq-msi-fixes-2.patch genirq-add-irq-chip-support.patch genirq-add-irq-chip-support-fix.patch genirq-add-irq-chip-support-misroute-irq-dont-call-desc-chip-end.patch genirq-add-handle_bad_irq.patch genirq-add-irq-wake-power-management-support.patch genirq-add-sa_trigger-support.patch genirq-cleanup-no_irq_type-no_irq_chip-rename.patch genirq-more-verbose-debugging-on-unexpected-irq-vectors.patch genirq-ia64-build-fix.patch genirq-add-irq_type_sense_mask.patch genirq-add-irq-chip-support-fasteoi-handler-handle-interrupt-disabling.patch genirq-irq-document-what-an-irq-is.patch genirq-add-chip-eoi-fastack-fasteoi-core.patch genirq-add-chip-eoi-fastack-fasteoi-fix.patch lockdep-floppyc-irq-release-fix.patch lockdep-acpi-locking-fix.patch lockdep-console_init-after-local_irq_enable.patch lockdep-add-is_module_address.patch lockdep-add-print_ip_sym.patch lockdep-add-per_cpu_offset.patch lockdep-add-disable-enable_irq_lockdep-api.patch lockdep-add-local_irq_enable_in_hardirq-api.patch lockdep-add-declare_completion_onstack-api.patch lockdep-clean-up-rwsems.patch lockdep-remove-rwsem_debug-remnants.patch lockdep-rename-debug_warn_on.patch lockdep-remove-debug_bug_on.patch lockdep-remove-mutex-deadlock-checking-code.patch lockdep-better-lock-debugging.patch lockdep-mutex-section-binutils-workaround.patch lockdep-locking-init-debugging-improvement.patch lockdep-beautify-x86_64-stacktraces.patch lockdep-x86_64-document-stack-frame-internals.patch lockdep-i386-remove-multi-entry-backtraces.patch lockdep-stacktrace-subsystem-core.patch lockdep-s390-config_frame_pointer-support.patch lockdep-stacktrace-subsystem-i386-support.patch lockdep-stacktrace-subsystem-x86_64-support.patch lockdep-stacktrace-subsystem-s390-support.patch lockdep-irqtrace-subsystem-core.patch lockdep-irqtrace-subsystem-docs.patch lockdep-irqtrace-subsystem-i386-support.patch lockdep-irqtrace-cleanup-of-include-asm-i386-irqflagsh.patch lockdep-irqtrace-subsystem-x86_64-support.patch lockdep-irqtrace-subsystem-x86_64-support-fix.patch lockdep-irqtrace-subsystem-x86_64-support-fix-2.patch lockdep-irqtrace-cleanup-of-include-asm-x86_64-irqflagsh.patch lockdep-irqtrace-subsystem-s390-support.patch lockdep-locking-api-self-tests.patch lockdep-core.patch lockdep-design-docs.patch lockdep-procfs.patch lockdep-prove-rwsem-locking-correctness.patch lockdep-prove-spinlock-rwlock-locking-correctness.patch lockdep-prove-mutex-locking-correctness.patch lockdep-kconfig.patch lockdep-print-all-lock-classes-on-sysrq-d.patch lockdep-x86_64-early-init.patch lockdep-x86-smp-alternatives-workaround.patch lockdep-do-not-recurse-in-printk.patch lockdep-fix-rt_hash_lock_sz.patch lockdep-s390-turn-validator-off-in-machine-check-handler.patch lockdep-enable-on-i386.patch lockdep-enable-on-x86_64.patch lockdep-enable-on-s390.patch lockdep-annotate-direct-io.patch lockdep-annotate-serial.patch lockdep-annotate-dcache.patch lockdep-annotate-i_mutex.patch lockdep-annotate-futex.patch lockdep-annotate-genirq.patch lockdep-annotate-waitqueues.patch lockdep-annotate-mm.patch lockdep-annotate-serio.patch lockdep-annotate-skb_queue_head_init.patch lockdep-annotate-timer-base-locks.patch lockdep-annotate-scheduler-runqueue-locks.patch lockdep-annotate-hrtimer-base-locks.patch lockdep-annotate-sock_lock_init.patch lockdep-annotate-af_unix-locking.patch lockdep-annotate-bh_lock_sock.patch lockdep-annotate-ieee1394-skb-queue-head-locking.patch lockdep-annotate-mmap_sem.patch lockdep-annotate-sunrpc-code.patch lockdep-annotate-ntfs-locking-rules.patch lockdep-annotate-the-quota-code.patch lockdep-annotate-usbfs.patch lockdep-annotate-sound-core-seq-seq_portsc.patch lockdep-annotate-sound-core-seq-seq_devicec.patch lockdep-annotate-8390c-disable_irq.patch lockdep-annotate-3c59xc-disable_irq.patch lockdep-annotate-forcedethc-disable_irq.patch lockdep-annotate-enable_in_hardirq.patch lockdep-annotate-on-stack-completions.patch lockdep-annotate-qeth-driver.patch lockdep-annotate-s_lock.patch lockdep-annotate-sb-s_umount.patch lockdep-annotate-slab-code.patch lockdep-annotate-blkdev-nesting.patch lockdep-annotate-vlan-net-device-as-being-a-special-class.patch lockdep-annotate-vlan-net-device-as-being-a-special-class-fix.patch detect-atomic-counter-underflows.patch debug-shared-irqs.patch make-frame_pointer-default=y.patch mutex-subsystem-synchro-test-module.patch vdso-print-fatal-signals.patch vdso-improve-print_fatal_signals-support-by-adding-memory-maps.patch - To unsubscribe from this list: send the line "unsubscribe mm-commits" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html