On Fri, Jul 9, 2021 at 1:02 AM Peter Zijlstra <peterz@xxxxxxxxxxxxx> wrote:
>
> On Thu, Jul 08, 2021 at 12:46:37PM -0700, Peter Oskolkov wrote:
>
> > +static inline int umcg_atomic_cmpxchg_64(u64 *uval, u64 __user *uaddr,
> > +                                         u64 oldval, u64 newval)
> > +{
> > +        int ret = 0;
> > +
> > +        if (!user_access_begin(uaddr, sizeof(u64)))
> > +                return -EFAULT;
> > +        asm volatile("\n"
> > +                "1:\t" LOCK_PREFIX "cmpxchgq %4, %2\n"
> > +                "2:\n"
> > +                "\t.section .fixup, \"ax\"\n"
> > +                "3:\tmov %3, %0\n"
> > +                "\tjmp 2b\n"
> > +                "\t.previous\n"
> > +                _ASM_EXTABLE_UA(1b, 3b)
> > +                : "+r" (ret), "=a" (oldval), "+m" (*uaddr)
> > +                : "i" (-EFAULT), "r" (newval), "1" (oldval)
> > +                : "memory"
> > +                );
> > +        user_access_end();
> > +        *uval = oldval;
> > +        return ret;
> > +}
>
> > +static inline int fix_pagefault(unsigned long uaddr, bool write_fault)
> > +{
> > +        struct mm_struct *mm = current->mm;
> > +        int ret;
> > +
> > +        mmap_read_lock(mm);
> > +        ret = fixup_user_fault(mm, uaddr, write_fault ? FAULT_FLAG_WRITE : 0,
> > +                               NULL);
> > +        mmap_read_unlock(mm);
> > +
> > +        return ret < 0 ? ret : 0;
> > +}
>
> > +static inline int umcg_cmpxchg_64_user(u64 __user *uaddr, u64 *prev, u64 val)
> > +{
> > +        while (true) {
> > +                int ret;
> > +                u64 expected = *prev;
> > +
> > +                pagefault_disable();
> > +                ret = umcg_atomic_cmpxchg_64(prev, uaddr, expected, val);
> > +                pagefault_enable();
> > +
> > +                if (!ret)
> > +                        return *prev == expected ? 0 : -EAGAIN;
> > +
> > +                if (WARN_ONCE(ret != -EFAULT, "Unexpected error"))
> > +                        return -EFAULT;
> > +
> > +                ret = fix_pagefault((unsigned long)uaddr, true);
> > +                if (ret)
> > +                        return -EFAULT;
> > +        }
> > +}
> > +
> > +/**
> > + * atomic_stack_push_user - push a node onto the stack
> > + * @head - a pointer to the head of the stack;
> > + * @node - a pointer to the node to push.
> > + *
> > + * Push a node onto a single-linked list (stack). Atomicity/correctness
> > + * is guaranteed by locking the head via setting its first bit (assuming
> > + * the pointer is aligned).
> > + *
> > + * Return: 0 on success, -EFAULT on error.
> > + */
> > +static inline int atomic_stack_push_user(u64 __user *head, u64 __user *node)
> > +{
> > +        while (true) {
> > +                int ret;
> > +                u64 first;
> > +
> > +                smp_mb(); /* Make the read below clean. */
> > +                if (get_user(first, head))
> > +                        return -EFAULT;
> > +
> > +                if (first & 1UL) {
> > +                        cpu_relax();
> > +                        continue; /* first is being deleted. */
> > +                }
> > +
> > +                if (put_user(first, node))
> > +                        return -EFAULT;
> > +                smp_mb(); /* Make the write above visible. */
> > +
> > +                ret = umcg_cmpxchg_64_user(head, &first, (u64)node);
> > +                if (!ret)
> > +                        return 0;
> > +
> > +                if (ret == -EAGAIN) {
> > +                        cpu_relax();
> > +                        continue;
> > +                }
> > +
> > +                if (WARN_ONCE(ret != -EFAULT, "unexpected umcg_cmpxchg result"))
> > +                        return -EFAULT;
> > +
> > +                return -EFAULT;
> > +        }
> > +}
>
> This is horrible... Jann is absolutely right, you do not, *ever* do
> userspace spinlocks. What's wrong with the trivial lockless single
> linked list approach?
I'm not sure how to get around the ABA problem with a lockless singly
linked list: https://en.wikipedia.org/wiki/ABA_problem

Our semantics can probably be relied on to prevent ABA from happening
with idle workers (popped/removed by the userspace), but idle servers
can trivially have it (a minimal sketch of the race is at the end of
this mail):

(initial state): head => Server A => Server B => NULL

task 1:
- read head (get A), read A (get B), {/* delayed */}

tasks 2-3-4:
- pop A, pop B, push A

task 1:
- cmp_xchg(head, A /* old */, B /* new */) - succeeds, now B is in the
  list erroneously

>
> On top of that, you really want to avoid all that endless stac/clac
> nonsense and only have that once, at the outer edges of things.
>
> Something like the *completely* untested below (except it needs lots of
> extra gunk to support compilers without asm-goto-output, and more widths
> and ...

I'll try the approach you suggest below once it is clear how to deal
with the ABA issue (and the wake server issue raised in patch 3).

>
>
> #define __try_cmpxchg_user_size(ptr, oldp, new, size, label)                \
> ({                                                                          \
>         _Bool __success;                                                    \
>         __chk_user_ptr(ptr);                                                \
>         __typeof__(ptr) _old = (__typeof__(ptr))(oldp);                     \
>         __typeof__(*(ptr)) __old = *_old;                                   \
>         __typeof__(*(ptr)) __new = (new);                                   \
>         switch (size) {                                                     \
>         case 8:                                                             \
>                 volatile u64 *__ptr = (volatile u64 *)(ptr);                \
>                 asm_volatile_goto("1: " LOCK_PREFIX "cmpxchgq %[new], %[ptr]" \
>                              CC_SET(z)                                      \
>                              _ASM_EXTABLE_UA(1b, %l[label])                 \
>                              : CC_OUT(z) (__success),                       \
>                                [ptr] "+m" (*__ptr),                         \
>                                [old] "+a" (__old)                           \
>                              : [new] "r" (__new)                            \
>                              : "memory"                                     \
>                              : label);                                      \
>                 break;                                                      \
>         }                                                                   \
>         if (unlikely(!__success))                                           \
>                 *_old = __old;                                              \
>         __success;                                                          \
> })
>
> #define unsafe_try_cmpxchg_user(ptr, oldp, new, label)                      \
>         __try_cmpxchg_user_size((ptr), (oldp), (new), sizeof(*(ptr)), label);
>
> int user_llist_add(u64 __user *new, u64 __user *head)
> {
>         u64 first;
>         int ret;
>
>         if (unlikely(!access_ok(new, sizeof(*new)) ||
>                      !access_ok(head, sizeof(*head))))
>                 return -EFAULT;
>
> again:
>         __uaccess_begin_nospec();
>
>         unsafe_get_user(first, head, Efault_head);
>         do {
>                 unsafe_put_user(first, new, Efault_new);
>         } while (!unsafe_try_cmpxchg_user(head, &first, new, Efault_head));
>
>         user_access_end();
>
>         return 0;
>
> Efault_new:
>         user_access_end();
>
>         ret = fixup_fault(new);
>         if (ret < 0)
>                 return ret;
>
>         goto again;
>
> Efault_head:
>         user_access_end();
>
>         ret = fixup_fault(head);
>         if (ret < 0)
>                 return ret;
>
>         goto again;
> }
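
To make the race above concrete, here is a minimal userspace-only sketch of a
plain CAS-on-the-head lockless stack, using C11 atomics; the names (struct
node, push(), pop()) are purely illustrative and are not from the patch or
from your sketch. The comment in pop() marks the window in which the
pop-pop-push sequence lets the cmpxchg succeed with a stale next pointer:

/* Illustrative only; compiles standalone with any C11 compiler. */
#include <stdatomic.h>
#include <stddef.h>

struct node {
        struct node *next;
};

static _Atomic(struct node *) head;     /* NULL-initialized stack head. */

static void push(struct node *n)
{
        struct node *first = atomic_load(&head);

        do {
                n->next = first;
                /* On failure 'first' is reloaded from 'head' and we retry. */
        } while (!atomic_compare_exchange_weak(&head, &first, n));
}

static struct node *pop(void)
{
        struct node *first = atomic_load(&head);

        while (first) {
                /*
                 * ABA window: if other tasks pop A (== first) and B
                 * (== first->next) and then push A back, 'head' equals
                 * 'first' again, so the cmpxchg below succeeds and installs
                 * the stale B pointer -- "now B is in the list erroneously".
                 */
                if (atomic_compare_exchange_weak(&head, &first, first->next))
                        break;
        }
        return first;
}

The usual userspace mitigations are a generation/tag counter packed alongside
the head pointer (double-width cmpxchg) or constraining which side is allowed
to pop; the sketch above assumes neither.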