Re: [PATCH v1] vdpa: Do not count the pages that were already pinned in the vhost-vDPA

Jason Wang <jasowang@xxxxxxxxxx> · Mon, 9 May 2022 16:02:06 +0800

On Mon, May 9, 2022 at 3:15 PM Cindy Lu <lulu@xxxxxxxxxx> wrote:
>
> We count pinned_vm as follow in vhost-vDPA
>
>         lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
>         if (npages + atomic64_read(&dev->mm->pinned_vm) > lock_limit) {
>                 ret = -ENOMEM;
>                 goto unlock;
>         }
> This means if we have two vDPA devices for the same VM the pages would be counted twice
> So we add a tree to save the page that counted and we will not count it again
>
> Signed-off-by: Cindy Lu <lulu@xxxxxxxxxx>
> ---
>  drivers/vhost/vdpa.c     | 79 ++++++++++++++++++++++++++++++++++++++--
>  include/linux/mm_types.h |  2 +
>  2 files changed, 78 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
> index 05f5fd2af58f..48cb5c8264b5 100644
> --- a/drivers/vhost/vdpa.c
> +++ b/drivers/vhost/vdpa.c
> @@ -24,6 +24,9 @@
>  #include <linux/vhost.h>
>
>  #include "vhost.h"
> +#include <linux/rbtree.h>
> +#include <linux/interval_tree.h>
> +#include <linux/interval_tree_generic.h>
>
>  enum {
>         VHOST_VDPA_BACKEND_FEATURES =
> @@ -505,6 +508,50 @@ static long vhost_vdpa_unlocked_ioctl(struct file *filep,
>         mutex_unlock(&d->mutex);
>         return r;
>  }
> +int vhost_vdpa_add_range_ctx(struct rb_root_cached *root, u64 start, u64 last)
> +{
> +       struct interval_tree_node *new_node;
> +
> +       if (last < start)
> +               return -EFAULT;
> +
> +       /* If the range being mapped is [0, ULONG_MAX], split it into two entries
> +        * otherwise its size would overflow u64.
> +        */
> +       if (start == 0 && last == ULONG_MAX) {
> +               u64 mid = last / 2;
> +
> +               vhost_vdpa_add_range_ctx(root, start, mid);
> +               start = mid + 1;
> +       }
> +
> +       new_node = kmalloc(sizeof(struct interval_tree_node), GFP_ATOMIC);
> +       if (!new_node)
> +               return -ENOMEM;
> +
> +       new_node->start = start;
> +       new_node->last = last;
> +
> +       interval_tree_insert(new_node, root);
> +
> +       return 0;
> +}
> +
> +void vhost_vdpa_del_range(struct rb_root_cached *root, u64 start, u64 last)
> +{
> +       struct interval_tree_node *new_node;
> +
> +       while ((new_node = interval_tree_iter_first(root, start, last))) {
> +               interval_tree_remove(new_node, root);
> +               kfree(new_node);
> +       }
> +}
> +
> +struct interval_tree_node *vhost_vdpa_search_range(struct rb_root_cached *root,
> +                                                  u64 start, u64 last)
> +{
> +       return interval_tree_iter_first(root, start, last);
> +}
>
>  static void vhost_vdpa_pa_unmap(struct vhost_vdpa *v, u64 start, u64 last)
>  {
> @@ -513,6 +560,7 @@ static void vhost_vdpa_pa_unmap(struct vhost_vdpa *v, u64 start, u64 last)
>         struct vhost_iotlb_map *map;
>         struct page *page;
>         unsigned long pfn, pinned;
> +       struct interval_tree_node *new_node = NULL;
>
>         while ((map = vhost_iotlb_itree_first(iotlb, start, last)) != NULL) {
>                 pinned = PFN_DOWN(map->size);
> @@ -523,7 +571,18 @@ static void vhost_vdpa_pa_unmap(struct vhost_vdpa *v, u64 start, u64 last)
>                                 set_page_dirty_lock(page);
>                         unpin_user_page(page);
>                 }
> -               atomic64_sub(PFN_DOWN(map->size), &dev->mm->pinned_vm);
> +
> +               new_node = vhost_vdpa_search_range(&dev->mm->root_for_vdpa,
> +                                                  map->start,
> +                                                  map->start + map->size - 1);
> +
> +               if (new_node) {
> +                       vhost_vdpa_del_range(&dev->mm->root_for_vdpa,
> +                                            map->start,
> +                                            map->start + map->size - 1);
> +                       atomic64_sub(PFN_DOWN(map->size), &dev->mm->pinned_vm);
> +               }
> +
>                 vhost_iotlb_map_free(iotlb, map);
>         }
>  }
> @@ -591,6 +650,7 @@ static int vhost_vdpa_map(struct vhost_vdpa *v, u64 iova,
>         struct vdpa_device *vdpa = v->vdpa;
>         const struct vdpa_config_ops *ops = vdpa->config;
>         int r = 0;
> +       struct interval_tree_node *new_node = NULL;
>
>         r = vhost_iotlb_add_range_ctx(dev->iotlb, iova, iova + size - 1,
>                                       pa, perm, opaque);
> @@ -611,9 +671,22 @@ static int vhost_vdpa_map(struct vhost_vdpa *v, u64 iova,
>                 return r;
>         }
>
> -       if (!vdpa->use_va)
> -               atomic64_add(PFN_DOWN(size), &dev->mm->pinned_vm);
> +       if (!vdpa->use_va) {
> +               new_node = vhost_vdpa_search_range(&dev->mm->root_for_vdpa,
> +                                                  iova, iova + size - 1);
> +
> +               if (new_node == 0) {
> +                       r = vhost_vdpa_add_range_ctx(&dev->mm->root_for_vdpa,
> +                                                    iova, iova + size - 1);
> +                       if (r) {
> +                               vhost_iotlb_del_range(dev->iotlb, iova,
> +                                                     iova + size - 1);
> +                               return r;
> +                       }
>
> +                       atomic64_add(PFN_DOWN(size), &dev->mm->pinned_vm);
> +               }

This seems not sufficient, consider:

vhost-vDPA-A: add [A, B)
vhost-vDPA-B: add [A, C) (C>B)

We lose the accounting for [B, C)?

> +       }
>         return 0;
>  }
>
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index 5140e5feb486..46eaa6d0560b 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -634,6 +634,8 @@ struct mm_struct {
>  #ifdef CONFIG_IOMMU_SUPPORT
>                 u32 pasid;
>  #endif
> +               struct rb_root_cached root_for_vdpa;
> +

Let's avoid touching mm_structure unless it's a must.

We can allocate something like vhost_mm if needed during SET_OWNER.

Thanks

>         } __randomize_layout;
>
>         /*
> --
> 2.34.1
>

_______________________________________________
Virtualization mailing list
Virtualization@xxxxxxxxxxxxxxxxxxxxxxxxxx
https://lists.linuxfoundation.org/mailman/listinfo/virtualization