On Thu, Aug 8, 2024 at 7:00 PM Mateusz Guzik <mjguzik@xxxxxxxxx> wrote:
>
> ACHTUNG: this is more of a request for benchmarking than a patch
> proposal at this stage
>
> I was pointed at your patch which moved the vma lock to a separate
> allocation [1]. The commit message does not say anything about making
> sure the object itself is allocated with proper alignment and I found
> that the cache creation lacks the HWCACHE_ALIGN flag, which may or may
> not be the problem.
>
> I verified with a simple one-liner that on a stock kernel the vmas keep
> roaming around with a 16-byte alignment:
> # bpftrace -e 'kretprobe:vm_area_alloc { @[retval & 0x3f] = count(); }'
> @[16]: 39
> @[0]: 46
> @[32]: 53
> @[48]: 56
>
> Note the stock vma lock cache also lacks the alignment flag. While I
> have not verified it experimentally, if the locks are also roaming it
> would mean that 2 unrelated vmas can false-share locks. If the patch
> below is a bust, the flag should probably be added to that one.
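
A toy userspace model of the packing concern above, using made-up
numbers (a 64-byte cache line and 32-byte lock objects, not values
measured on a real kernel), shows how a cache created without
HWCACHE_ALIGN can hand out two locks per line:

#include <stdio.h>

#define CACHE_LINE 64	/* assumed line size */
#define LOCK_SIZE  32	/* stand-in for sizeof(struct vma_lock) */

int main(void)
{
	unsigned long addr, prev_last_line = -1UL;

	/* pack lock objects back to back, as a slab cache created
	 * without HWCACHE_ALIGN is free to do */
	for (addr = 0; addr < 4 * CACHE_LINE; addr += LOCK_SIZE) {
		unsigned long line = addr / CACHE_LINE;

		printf("lock at %3lu -> line %lu%s\n", addr, line,
		       line == prev_last_line ?
		       " (shared with the previous lock)" : "");
		prev_last_line = (addr + LOCK_SIZE - 1) / CACHE_LINE;
	}
	return 0;
}

Every second lock lands in a line already occupied by its neighbour,
which is exactly the 2-unrelated-vmas scenario described above.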
>
> The patch below has slapped-around vma lock cache removal + HWCACHE_ALIGN
> for the vma cache. I left a placeholder pointer in the struct so as not
> to change the relative offsets of the current fields. It does compile
> without CONFIG_PER_VMA_LOCK.
>
> Vlastimil says you tested a case where the struct got bloated to 256
> bytes, but the lock remained separate. It is unclear to me if this
> happened with allocations made with the HWCACHE_ALIGN flag though.
>
> There is 0 urgency on my end, but it would be nice if you could try
> this out with your test rig.

Hi Mateusz,

Sure, I'll give it a spin, but I'm not optimistic. Your code looks
almost identical to my latest attempt, where I tried placing vm_lock
into different cachelines, including a separate one, and using
HWCACHE_ALIGN. And yet all my attempts showed a regression.

Just FYI, the test I'm using is the pft-threads test from the mmtests
suite. I'll post the results this evening.

Thanks,
Suren.
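
For reference, mmtests lives at https://github.com/gormanm/mmtests and a
run looks roughly like the below (the pft-threads config file name is
from memory and may differ in your checkout):

# ./run-mmtests.sh --config configs/config-workload-pft-threads pft-baseline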
>
> 1: https://lore.kernel.org/all/20230227173632.3292573-34-surenb@xxxxxxxxxx/T/#u
>
> ---
>  include/linux/mm.h       | 18 +++++++--------
>  include/linux/mm_types.h | 10 ++++-----
>  kernel/fork.c            | 47 ++++------------------------------------
>  mm/userfaultfd.c         |  6 ++---
>  4 files changed, 19 insertions(+), 62 deletions(-)
>
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 43b40334e9b2..6d8b668d3deb 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -687,7 +687,7 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
>  	if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq))
>  		return false;
>
> -	if (unlikely(down_read_trylock(&vma->vm_lock->lock) == 0))
> +	if (unlikely(down_read_trylock(&vma->vm_lock) == 0))
>  		return false;
>
>  	/*
> @@ -702,7 +702,7 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
>  	 * This pairs with RELEASE semantics in vma_end_write_all().
>  	 */
>  	if (unlikely(vma->vm_lock_seq == smp_load_acquire(&vma->vm_mm->mm_lock_seq))) {
> -		up_read(&vma->vm_lock->lock);
> +		up_read(&vma->vm_lock);
>  		return false;
>  	}
>  	return true;
> @@ -711,7 +711,7 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
>  static inline void vma_end_read(struct vm_area_struct *vma)
>  {
>  	rcu_read_lock(); /* keeps vma alive till the end of up_read */
> -	up_read(&vma->vm_lock->lock);
> +	up_read(&vma->vm_lock);
>  	rcu_read_unlock();
>  }
>
> @@ -740,7 +740,7 @@ static inline void vma_start_write(struct vm_area_struct *vma)
>  	if (__is_vma_write_locked(vma, &mm_lock_seq))
>  		return;
>
> -	down_write(&vma->vm_lock->lock);
> +	down_write(&vma->vm_lock);
>  	/*
>  	 * We should use WRITE_ONCE() here because we can have concurrent reads
>  	 * from the early lockless pessimistic check in vma_start_read().
> @@ -748,7 +748,7 @@ static inline void vma_start_write(struct vm_area_struct *vma)
>  	 * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy.
>  	 */
>  	WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);
> -	up_write(&vma->vm_lock->lock);
> +	up_write(&vma->vm_lock);
>  }
>
>  static inline void vma_assert_write_locked(struct vm_area_struct *vma)
> @@ -760,7 +760,7 @@ static inline void vma_assert_write_locked(struct vm_area_struct *vma)
>
>  static inline void vma_assert_locked(struct vm_area_struct *vma)
>  {
> -	if (!rwsem_is_locked(&vma->vm_lock->lock))
> +	if (!rwsem_is_locked(&vma->vm_lock))
>  		vma_assert_write_locked(vma);
>  }
>
> @@ -827,10 +827,6 @@ static inline void assert_fault_locked(struct vm_fault *vmf)
>
>  extern const struct vm_operations_struct vma_dummy_vm_ops;
>
> -/*
> - * WARNING: vma_init does not initialize vma->vm_lock.
> - * Use vm_area_alloc()/vm_area_free() if vma needs locking.
> - */
>  static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
>  {
>  	memset(vma, 0, sizeof(*vma));
> @@ -839,6 +835,8 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
>  	INIT_LIST_HEAD(&vma->anon_vma_chain);
>  	vma_mark_detached(vma, false);
>  	vma_numab_state_init(vma);
> +	init_rwsem(&vma->vm_lock);
> +	vma->vm_lock_seq = -1;
>  }
>
>  /* Use when VMA is not part of the VMA tree and needs no locking */
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index 003619fab20e..caffdb4eeb94 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -615,10 +615,6 @@ static inline struct anon_vma_name *anon_vma_name_alloc(const char *name)
>  }
>  #endif
>
> -struct vma_lock {
> -	struct rw_semaphore lock;
> -};
> -
>  struct vma_numab_state {
>  	/*
>  	 * Initialised as time in 'jiffies' after which VMA
> @@ -716,8 +712,7 @@ struct vm_area_struct {
>  	 * slowpath.
>  	 */
>  	int vm_lock_seq;
> -	/* Unstable RCU readers are allowed to read this. */
> -	struct vma_lock *vm_lock;
> +	void *vm_dummy;
>  #endif
>
>  	/*
> @@ -770,6 +765,9 @@ struct vm_area_struct {
>  		struct vma_numab_state *numab_state;	/* NUMA Balancing state */
>  #endif
>  	struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
> +#ifdef CONFIG_PER_VMA_LOCK
> +	struct rw_semaphore vm_lock ____cacheline_aligned_in_smp;
> +#endif
>  } __randomize_layout;
>
>  #ifdef CONFIG_NUMA
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 92bfe56c9fed..eab04a24d5f1 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -436,35 +436,6 @@ static struct kmem_cache *vm_area_cachep;
>  /* SLAB cache for mm_struct structures (tsk->mm) */
>  static struct kmem_cache *mm_cachep;
>
> -#ifdef CONFIG_PER_VMA_LOCK
> -
> -/* SLAB cache for vm_area_struct.lock */
> -static struct kmem_cache *vma_lock_cachep;
> -
> -static bool vma_lock_alloc(struct vm_area_struct *vma)
> -{
> -	vma->vm_lock = kmem_cache_alloc(vma_lock_cachep, GFP_KERNEL);
> -	if (!vma->vm_lock)
> -		return false;
> -
> -	init_rwsem(&vma->vm_lock->lock);
> -	vma->vm_lock_seq = -1;
> -
> -	return true;
> -}
> -
> -static inline void vma_lock_free(struct vm_area_struct *vma)
> -{
> -	kmem_cache_free(vma_lock_cachep, vma->vm_lock);
> -}
> -
> -#else /* CONFIG_PER_VMA_LOCK */
> -
> -static inline bool vma_lock_alloc(struct vm_area_struct *vma) { return true; }
> -static inline void vma_lock_free(struct vm_area_struct *vma) {}
> -
> -#endif /* CONFIG_PER_VMA_LOCK */
> -
>  struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
>  {
>  	struct vm_area_struct *vma;
> @@ -474,10 +445,6 @@ struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
>  		return NULL;
>
>  	vma_init(vma, mm);
> -	if (!vma_lock_alloc(vma)) {
> -		kmem_cache_free(vm_area_cachep, vma);
> -		return NULL;
> -	}
>
>  	return vma;
>  }
> @@ -496,10 +463,8 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
>  	 * will be reinitialized.
>  	 */
>  	data_race(memcpy(new, orig, sizeof(*new)));
> -	if (!vma_lock_alloc(new)) {
> -		kmem_cache_free(vm_area_cachep, new);
> -		return NULL;
> -	}
> +	init_rwsem(&new->vm_lock);
> +	new->vm_lock_seq = -1;
>  	INIT_LIST_HEAD(&new->anon_vma_chain);
>  	vma_numab_state_init(new);
>  	dup_anon_vma_name(orig, new);
> @@ -511,7 +476,6 @@ void __vm_area_free(struct vm_area_struct *vma)
>  {
>  	vma_numab_state_free(vma);
>  	free_anon_vma_name(vma);
> -	vma_lock_free(vma);
>  	kmem_cache_free(vm_area_cachep, vma);
>  }
>
> @@ -522,7 +486,7 @@ static void vm_area_free_rcu_cb(struct rcu_head *head)
>  						  vm_rcu);
>
>  	/* The vma should not be locked while being destroyed. */
> -	VM_BUG_ON_VMA(rwsem_is_locked(&vma->vm_lock->lock), vma);
> +	VM_BUG_ON_VMA(rwsem_is_locked(&vma->vm_lock), vma);
>  	__vm_area_free(vma);
>  }
>  #endif
> @@ -3192,10 +3156,7 @@ void __init proc_caches_init(void)
>  			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
>  			NULL);
>
> -	vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT);
> -#ifdef CONFIG_PER_VMA_LOCK
> -	vma_lock_cachep = KMEM_CACHE(vma_lock, SLAB_PANIC|SLAB_ACCOUNT);
> -#endif
> +	vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT|SLAB_HWCACHE_ALIGN);
>  	mmap_init();
>  	nsproxy_cache_init();
>  }
> diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
> index 3b7715ecf292..e95ecb2063d2 100644
> --- a/mm/userfaultfd.c
> +++ b/mm/userfaultfd.c
> @@ -92,7 +92,7 @@ static struct vm_area_struct *uffd_lock_vma(struct mm_struct *mm,
>  		 * mmap_lock, which guarantees that nobody can lock the
>  		 * vma for write (vma_start_write()) under us.
>  		 */
> -		down_read(&vma->vm_lock->lock);
> +		down_read(&vma->vm_lock);
>  	}
>
>  	mmap_read_unlock(mm);
> @@ -1468,9 +1468,9 @@ static int uffd_move_lock(struct mm_struct *mm,
>  		 * See comment in uffd_lock_vma() as to why not using
>  		 * vma_start_read() here.
>  		 */
> -		down_read(&(*dst_vmap)->vm_lock->lock);
> +		down_read(&(*dst_vmap)->vm_lock);
>  		if (*dst_vmap != *src_vmap)
> -			down_read_nested(&(*src_vmap)->vm_lock->lock,
> +			down_read_nested(&(*src_vmap)->vm_lock,
>  					 SINGLE_DEPTH_NESTING);
>  	}
>  	mmap_read_unlock(mm);
> --
> 2.43.0
>
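
Once the patch is applied, the bpftrace one-liner from the top of the
thread is an easy way to check that HWCACHE_ALIGN took effect; every vma
should then land in the @[0] bucket:

# bpftrace -e 'kretprobe:vm_area_alloc { @[retval & 0x3f] = count(); }'

If debug info is available, pahole -C vm_area_struct vmlinux should
likewise show vm_lock starting on a 64-byte boundary, with padding
inserted before it.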