Re: [RFC PATCH 2/3 v0.2] sched/umcg: RFC: add userspace atomic helpers

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Fri, Jul 9, 2021 at 1:02 AM Peter Zijlstra <peterz@xxxxxxxxxxxxx> wrote:
>
> On Thu, Jul 08, 2021 at 12:46:37PM -0700, Peter Oskolkov wrote:
>
> > +static inline int umcg_atomic_cmpxchg_64(u64 *uval, u64 __user *uaddr,
> > +                                             u64 oldval, u64 newval)
> > +{
> > +     int ret = 0;
> > +
> > +     if (!user_access_begin(uaddr, sizeof(u64)))
> > +             return -EFAULT;
> > +     asm volatile("\n"
> > +             "1:\t" LOCK_PREFIX "cmpxchgq %4, %2\n"
> > +             "2:\n"
> > +             "\t.section .fixup, \"ax\"\n"
> > +             "3:\tmov     %3, %0\n"
> > +             "\tjmp     2b\n"
> > +             "\t.previous\n"
> > +             _ASM_EXTABLE_UA(1b, 3b)
> > +             : "+r" (ret), "=a" (oldval), "+m" (*uaddr)
> > +             : "i" (-EFAULT), "r" (newval), "1" (oldval)
> > +             : "memory"
> > +     );
> > +     user_access_end();
> > +     *uval = oldval;
> > +     return ret;
> > +}
>
> > +static inline int fix_pagefault(unsigned long uaddr, bool write_fault)
> > +{
> > +     struct mm_struct *mm = current->mm;
> > +     int ret;
> > +
> > +     mmap_read_lock(mm);
> > +     ret = fixup_user_fault(mm, uaddr, write_fault ? FAULT_FLAG_WRITE : 0,
> > +                     NULL);
> > +     mmap_read_unlock(mm);
> > +
> > +     return ret < 0 ? ret : 0;
> > +}
>
> > +static inline int umcg_cmpxchg_64_user(u64 __user *uaddr, u64 *prev, u64 val)
> > +{
> > +     while (true) {
> > +             int ret;
> > +             u64 expected = *prev;
> > +
> > +             pagefault_disable();
> > +             ret = umcg_atomic_cmpxchg_64(prev, uaddr, expected, val);
> > +             pagefault_enable();
> > +
> > +             if (!ret)
> > +                     return *prev == expected ? 0 : -EAGAIN;
> > +
> > +             if (WARN_ONCE(ret != -EFAULT, "Unexpected error"))
> > +                     return -EFAULT;
> > +
> > +             ret = fix_pagefault((unsigned long)uaddr, true);
> > +             if (ret)
> > +                     return -EFAULT;
> > +     }
> > +}
> > +
> > +/**
> > + * atomic_stack_push_user - push a node onto the stack
> > + * @head - a pointer to the head of the stack;
> > + * @node - a pointer to the node to push.
> > + *
> > + * Push a node onto a single-linked list (stack). Atomicity/correctness
> > + * is guaranteed by locking the head via settings its first bit (assuming
> > + * the pointer is aligned).
> > + *
> > + * Return: 0 on success, -EFAULT on error.
> > + */
> > +static inline int atomic_stack_push_user(u64 __user *head, u64 __user *node)
> > +{
> > +     while (true) {
> > +             int ret;
> > +             u64 first;
> > +
> > +             smp_mb();  /* Make the read below clean. */
> > +             if (get_user(first, head))
> > +                     return -EFAULT;
> > +
> > +             if (first & 1UL) {
> > +                     cpu_relax();
> > +                     continue;  /* first is being deleted. */
> > +             }
> > +
> > +             if (put_user(first, node))
> > +                     return -EFAULT;
> > +             smp_mb();  /* Make the write above visible. */
> > +
> > +             ret = umcg_cmpxchg_64_user(head, &first, (u64)node);
> > +             if (!ret)
> > +                     return 0;
> > +
> > +             if (ret == -EAGAIN) {
> > +                     cpu_relax();
> > +                     continue;
> > +             }
> > +
> > +             if (WARN_ONCE(ret != -EFAULT, "unexpected umcg_cmpxchg result"))
> > +                     return -EFAULT;
> > +
> > +             return -EFAULT;
> > +     }
> > +}
>
>
> This is horrible... Jann is absolutely right, you do not, *ever* do
> userspace spinlocks. What's wrong with the trivial lockless single
> linked list approach?.

I'm not sure how to get around the ABA problem with a lockless single
linked list: https://en.wikipedia.org/wiki/ABA_problem

Our semantics can probably be relied on to prevent ABA from happening
with idle workers (popped/removed by the userspace), but idle servers
can trivially have it:

(initial state): head => Server A => Server B => NULL

task1 :
- read head (get A), read A (get B), {/* delayed */}

tasks 2-3-4:
- pop A, pop B, push A

task 1:
- cmp_xchg(head, A /* old */, B /* new */) - succeeds, now B is in the
list erroneously

>
> On top of that, you really want to avoid all that endless stac/clac
> nonsense and only have that once, at the outer edges of things.
>
> Something like the *completely* untested below (except it needs lots of
> extra gunk to support compilers without asm-goto-output, and more widths
> and ...

I'll try the approach you suggest below once it is clear how to deal
with the ABA issue (and the wake server issue raised in patch 3).

>
>
> #define __try_cmpxchg_user_size(ptr, oldp, new, size, label)            \
> ({                                                                      \
>         _Bool __success;                                                \
>         __chk_user_ptr(ptr);                                            \
>         __typeof__(ptr) _old = (__typeof__(ptr))(oldp);                 \
>         __typeof__(*(ptr)) __old = *_old;                               \
>         __typeof__(*(ptr)) __new = (new);                               \
>         switch (size) {                                                 \
>         case 8:                                                         \
>                 volatile u64 *__ptr = (volatile u64 *)(ptr);            \
>                 asm_volatile_goto("1: " LOCK_PREFIX "cmpxchgq %[new], %[ptr]" \
>                                   CC_SET(z)                             \
>                                   _ASM_EXTABLE_UA(1b, %l[label])        \
>                                   : CC_OUT(x) (__success),              \
>                                     [ptr] "+m" (*__ptr),                \
>                                     [old] "+a" (__old),                 \
>                                   : [new] "r" (__new)                   \
>                                   : "memory"                            \
>                                   : label);                             \
>                 break;                                                  \
>         }                                                               \
>         if (unlikely(!success))                                         \
>                 *_old = __old;                                          \
>         __success;                                                      \
> })
>
> #define unsafe_try_cmpxchg_user(ptr, oldp, new, label)                  \
>         __try_cmpxchg_user_size((ptr), (oldp), (new), sizeof(*(ptr)), label);
>
> int user_llist_add(u64 __user *new, u64 __user *head)
> {
>         u64 first;
>         int ret;
>
>         if (unlikely(!access_ok(new, sizeof(*new)) ||
>                      !access_ok(head, sizeof(*head))))
>                 return -EFAULT;
>
> again:
>         __uaccess_begin_nospec();
>
>         unsafe_get_user(first, head, Efault_head);
>         do {
>                 unsafe_put_user(first, new, Efault_new);
>         } while (!unsafe_try_cmpxchg_user(head, &first, new, Efault_head));
>
>         user_access_end();
>
>         return 0;
>
> Efault_new:
>         user_access_end();
>
>         ret = fixup_fault(new);
>         if (ret < 0)
>                 return ret;
>
>         goto again;
>
> Efault_head:
>         user_access_end();
>
>         ret = fixup_fault(head);
>         if (ret < 0)
>                 return ret;
>
>         goto again;
> }



[Index of Archives]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux