Re: [PATCH 2/4] hugetlbfs: extend hugetlb_vma_lock to private VMAs

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Thu, Oct 05, 2023 at 11:59:07PM -0400, riel@xxxxxxxxxxx wrote:
> From: Rik van Riel <riel@xxxxxxxxxxx>
> 
> Extend the locking scheme used to protect shared hugetlb mappings
> from truncate vs page fault races, in order to protect private
> hugetlb mappings (with resv_map) against MADV_DONTNEED.
> 
> Add a read-write semaphore to the resv_map data structure, and
> use that from the hugetlb_vma_(un)lock_* functions, in preparation
> for closing the race between MADV_DONTNEED and page faults.
> 
> Signed-off-by: Rik van Riel <riel@xxxxxxxxxxx>
> Reviewed-by: Mike Kravetz <mike.kravetz@xxxxxxxxxx>
> Cc: stable@xxxxxxxxxx
> Fixes: 04ada095dcfc ("hugetlb: don't delete vma_lock in hugetlb MADV_DONTNEED processing")
> ---
>  include/linux/hugetlb.h |  6 ++++++
>  mm/hugetlb.c            | 41 +++++++++++++++++++++++++++++++++++++----
>  2 files changed, 43 insertions(+), 4 deletions(-)
> 
> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> index 5b2626063f4f..694928fa06a3 100644
> --- a/include/linux/hugetlb.h
> +++ b/include/linux/hugetlb.h
> @@ -60,6 +60,7 @@ struct resv_map {
>  	long adds_in_progress;
>  	struct list_head region_cache;
>  	long region_cache_count;
> +	struct rw_semaphore rw_sema;
>  #ifdef CONFIG_CGROUP_HUGETLB
>  	/*
>  	 * On private mappings, the counter to uncharge reservations is stored
> @@ -1231,6 +1232,11 @@ static inline bool __vma_shareable_lock(struct vm_area_struct *vma)
>  	return (vma->vm_flags & VM_MAYSHARE) && vma->vm_private_data;
>  }
>  
> +static inline bool __vma_private_lock(struct vm_area_struct *vma)
> +{
> +	return (!(vma->vm_flags & VM_MAYSHARE)) && vma->vm_private_data;
> +}
> +
>  /*
>   * Safe version of huge_pte_offset() to check the locks.  See comments
>   * above huge_pte_offset().
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index a86e070d735b..dd3de6ec8f1a 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -97,6 +97,7 @@ static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma);
>  static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma);
>  static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
>  		unsigned long start, unsigned long end);
> +static struct resv_map *vma_resv_map(struct vm_area_struct *vma);
>  
>  static inline bool subpool_is_free(struct hugepage_subpool *spool)
>  {
> @@ -267,6 +268,10 @@ void hugetlb_vma_lock_read(struct vm_area_struct *vma)
>  		struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
>  
>  		down_read(&vma_lock->rw_sema);
> +	} else if (__vma_private_lock(vma)) {
> +		struct resv_map *resv_map = vma_resv_map(vma);
> +
> +		down_read(&resv_map->rw_sema);
>  	}
>  }

+Ackerley +Oscar

I'm reading the resv code recently and just stumbled upon this. So want to
raise this question.

IIUC __vma_private_lock() will return false for MAP_PRIVATE hugetlb vma if
the vma is dup()ed from a fork(), with/without commit 187da0f8250a
("hugetlb: fix null-ptr-deref in hugetlb_vma_lock_write") which fixed a
slightly different issue.

The problem is the current vma lock for private mmap() is based on the resv
map, and the resv map only belongs to the process that mmap()ed this
private vma.  E.g. dup_mmap() has:

                if (is_vm_hugetlb_page(tmp))
                        hugetlb_dup_vma_private(tmp);

Which does:

	if (vma->vm_flags & VM_MAYSHARE) {
                ...
	} else
		vma->vm_private_data = NULL; <---------------------

So even if I don't know how many of us are even using hugetlb PRIVATE +
fork(), assuming that's the most controversial use case that I'm aware of
on hugetlb that people complains about.. with some tricky changes like
04f2cbe35699.. Just still want to raise this pure question, that after a
fork() on private vma, and if I read it alright, lock/unlock operations may
become noop..

Thanks,

>  
> @@ -276,6 +281,10 @@ void hugetlb_vma_unlock_read(struct vm_area_struct *vma)
>  		struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
>  
>  		up_read(&vma_lock->rw_sema);
> +	} else if (__vma_private_lock(vma)) {
> +		struct resv_map *resv_map = vma_resv_map(vma);
> +
> +		up_read(&resv_map->rw_sema);
>  	}
>  }
>  
> @@ -285,6 +294,10 @@ void hugetlb_vma_lock_write(struct vm_area_struct *vma)
>  		struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
>  
>  		down_write(&vma_lock->rw_sema);
> +	} else if (__vma_private_lock(vma)) {
> +		struct resv_map *resv_map = vma_resv_map(vma);
> +
> +		down_write(&resv_map->rw_sema);
>  	}
>  }
>  
> @@ -294,17 +307,27 @@ void hugetlb_vma_unlock_write(struct vm_area_struct *vma)
>  		struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
>  
>  		up_write(&vma_lock->rw_sema);
> +	} else if (__vma_private_lock(vma)) {
> +		struct resv_map *resv_map = vma_resv_map(vma);
> +
> +		up_write(&resv_map->rw_sema);
>  	}
>  }
>  
>  int hugetlb_vma_trylock_write(struct vm_area_struct *vma)
>  {
> -	struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
>  
> -	if (!__vma_shareable_lock(vma))
> -		return 1;
> +	if (__vma_shareable_lock(vma)) {
> +		struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
>  
> -	return down_write_trylock(&vma_lock->rw_sema);
> +		return down_write_trylock(&vma_lock->rw_sema);
> +	} else if (__vma_private_lock(vma)) {
> +		struct resv_map *resv_map = vma_resv_map(vma);
> +
> +		return down_write_trylock(&resv_map->rw_sema);
> +	}
> +
> +	return 1;
>  }
>  
>  void hugetlb_vma_assert_locked(struct vm_area_struct *vma)
> @@ -313,6 +336,10 @@ void hugetlb_vma_assert_locked(struct vm_area_struct *vma)
>  		struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
>  
>  		lockdep_assert_held(&vma_lock->rw_sema);
> +	} else if (__vma_private_lock(vma)) {
> +		struct resv_map *resv_map = vma_resv_map(vma);
> +
> +		lockdep_assert_held(&resv_map->rw_sema);
>  	}
>  }
>  
> @@ -345,6 +372,11 @@ static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma)
>  		struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
>  
>  		__hugetlb_vma_unlock_write_put(vma_lock);
> +	} else if (__vma_private_lock(vma)) {
> +		struct resv_map *resv_map = vma_resv_map(vma);
> +
> +		/* no free for anon vmas, but still need to unlock */
> +		up_write(&resv_map->rw_sema);
>  	}
>  }
>  
> @@ -1068,6 +1100,7 @@ struct resv_map *resv_map_alloc(void)
>  	kref_init(&resv_map->refs);
>  	spin_lock_init(&resv_map->lock);
>  	INIT_LIST_HEAD(&resv_map->regions);
> +	init_rwsem(&resv_map->rw_sema);
>  
>  	resv_map->adds_in_progress = 0;
>  	/*
> -- 
> 2.41.0
> 
> 

-- 
Peter Xu





[Index of Archives]     [Linux ARM Kernel]     [Linux ARM]     [Linux Omap]     [Fedora ARM]     [IETF Annouce]     [Bugtraq]     [Linux OMAP]     [Linux MIPS]     [eCos]     [Asterisk Internet PBX]     [Linux API]

  Powered by Linux