Peter Xu <peterx@xxxxxxxxxx> writes:

> <snip>
>
> Signed-off-by: Peter Xu <peterx@xxxxxxxxxx>
> ---
>  mm/hugetlb.c | 116 +++++++++++++++++++++++++++++++++++----------------
>  1 file changed, 80 insertions(+), 36 deletions(-)
>
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index dfd479a857b6..14cfe0bb01e4 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -2956,6 +2956,25 @@ int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list)
>  	return ret;
>  }
>  
> +typedef enum {
> +	/*
> +	 * For either 0/1: we checked the per-vma resv map, and one resv
> +	 * count either can be reused (0), or an extra is needed (1).
> +	 */
> +	MAP_CHG_REUSE = 0,
> +	MAP_CHG_NEEDED = 1,
> +	/*
> +	 * The per-vma resv count cannot be used, hence a new resv
> +	 * count is enforced.
> +	 *
> +	 * NOTE: This is mostly identical to MAP_CHG_NEEDED, except
> +	 * that currently vma_needs_reservation() has an unwanted side
> +	 * effect of requiring either end() or commit() to complete the
> +	 * transaction.  Hence it needs to be differentiated from NEEDED.
> +	 */
> +	MAP_CHG_ENFORCED = 2,
> +} map_chg_state;
> +
>  /*
>   * NOTE! "cow_from_owner" represents a very hacky usage only used in CoW
>   * faults of hugetlb private mappings on top of a non-page-cache folio (in
> @@ -2969,12 +2988,11 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
>  	struct hugepage_subpool *spool = subpool_vma(vma);
>  	struct hstate *h = hstate_vma(vma);
>  	struct folio *folio;
> -	long map_chg, map_commit, nr_pages = pages_per_huge_page(h);
> -	long gbl_chg;
> +	long retval, gbl_chg, nr_pages = pages_per_huge_page(h);
> +	map_chg_state map_chg;
>  	int memcg_charge_ret, ret, idx;
>  	struct hugetlb_cgroup *h_cg = NULL;
>  	struct mem_cgroup *memcg;
> -	bool deferred_reserve;
>  	gfp_t gfp = htlb_alloc_mask(h) | __GFP_RETRY_MAYFAIL;
>  
>  	memcg = get_mem_cgroup_from_current();
> @@ -2985,36 +3003,56 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
>  	}
>  
>  	idx = hstate_index(h);
> -	/*
> -	 * Examine the region/reserve map to determine if the process
> -	 * has a reservation for the page to be allocated.  A return
> -	 * code of zero indicates a reservation exists (no change).
> -	 */
> -	map_chg = gbl_chg = vma_needs_reservation(h, vma, addr);
> -	if (map_chg < 0) {
> -		if (!memcg_charge_ret)
> -			mem_cgroup_cancel_charge(memcg, nr_pages);
> -		mem_cgroup_put(memcg);
> -		return ERR_PTR(-ENOMEM);
> +
> +	/* Do we need a separate per-vma reservation? */
> +	if (cow_from_owner) {
> +		/*
> +		 * Special case!  Since it's a CoW on top of a reserved
> +		 * page, the private resv map doesn't count.  So it cannot
> +		 * consume the per-vma resv map even if it's reserved.
> +		 */
> +		map_chg = MAP_CHG_ENFORCED;
> +	} else {
> +		/*
> +		 * Examine the region/reserve map to determine if the process
> +		 * has a reservation for the page to be allocated.  A return
> +		 * code of zero indicates a reservation exists (no change).
> +		 */
> +		retval = vma_needs_reservation(h, vma, addr);
> +		if (retval < 0) {
> +			if (!memcg_charge_ret)
> +				mem_cgroup_cancel_charge(memcg, nr_pages);
> +			mem_cgroup_put(memcg);
> +			return ERR_PTR(-ENOMEM);
> +		}
> +		map_chg = retval ? MAP_CHG_NEEDED : MAP_CHG_REUSE;
>  	}
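
Not an objection, just checking that I read the new state selection
right: cow_from_owner unconditionally forces a fresh reservation and
never consults the per-vma resv map, everything else still goes through
vma_needs_reservation().  A compressed, untested restatement (ignoring
the -ENOMEM path):

	/* Reviewer's condensation, not part of the patch: */
	map_chg = cow_from_owner ? MAP_CHG_ENFORCED :
		  vma_needs_reservation(h, vma, addr) ? MAP_CHG_NEEDED :
							MAP_CHG_REUSE;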
>  	/*
> +	 * Do we need a separate global reservation?
> +	 *
>  	 * Processes that did not create the mapping will have no
>  	 * reserves as indicated by the region/reserve map. Check
>  	 * that the allocation will not exceed the subpool limit.
> -	 * Allocations for MAP_NORESERVE mappings also need to be
> -	 * checked against any subpool limit.
> +	 * Or whether it can get one from the pool reservation directly.
>  	 */
> -	if (map_chg || cow_from_owner) {
> +	if (map_chg) {
>  		gbl_chg = hugepage_subpool_get_pages(spool, 1);
>  		if (gbl_chg < 0)
>  			goto out_end_reservation;
> +	} else {
> +		/*
> +		 * If we have the vma reservation ready, no need for an
> +		 * extra global reservation.
> +		 */
> +		gbl_chg = 0;
>  	}
>  
> -	/* If this allocation is not consuming a reservation, charge it now.
> +	/*
> +	 * If this allocation is not consuming a per-vma reservation,
> +	 * charge the hugetlb cgroup now.
>  	 */
> -	deferred_reserve = map_chg || cow_from_owner;
> -	if (deferred_reserve) {
> +	if (map_chg) {
>  		ret = hugetlb_cgroup_charge_cgroup_rsvd(
>  			idx, pages_per_huge_page(h), &h_cg);

Should hugetlb_cgroup_charge_cgroup_rsvd() be called when
map_chg == MAP_CHG_ENFORCED?  (See the sketch after the two related
sites below.)

>  		if (ret)
> @@ -3038,7 +3076,7 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
>  	if (!folio)
>  		goto out_uncharge_cgroup;
>  	spin_lock_irq(&hugetlb_lock);
> -	if (!cow_from_owner && vma_has_reserves(vma, gbl_chg)) {
> +	if (vma_has_reserves(vma, gbl_chg)) {
>  		folio_set_hugetlb_restore_reserve(folio);
>  		h->resv_huge_pages--;
>  	}
> @@ -3051,7 +3089,7 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
>  	/* If allocation is not consuming a reservation, also store the
>  	 * hugetlb_cgroup pointer on the page.
>  	 */
> -	if (deferred_reserve) {
> +	if (map_chg) {
>  		hugetlb_cgroup_commit_charge_rsvd(idx, pages_per_huge_page(h),
>  						  h_cg, folio);
>  	}

Same question for this map_chg check,

> @@ -3060,26 +3098,31 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
>  
>  	hugetlb_set_folio_subpool(folio, spool);
>  
> -	map_commit = vma_commit_reservation(h, vma, addr);
> -	if (unlikely(map_chg > map_commit)) {
> +	if (map_chg != MAP_CHG_ENFORCED) {
> +		/* commit() is only needed if the map_chg is not enforced */
> +		retval = vma_commit_reservation(h, vma, addr);
>  		/*
> +		 * Check for a possible race condition.  When it happens:
>  		 * The page was added to the reservation map between
>  		 * vma_needs_reservation and vma_commit_reservation.
>  		 * This indicates a race with hugetlb_reserve_pages.
>  		 * Adjust for the subpool count incremented above AND
> -		 * in hugetlb_reserve_pages for the same page. Also,
> +		 * in hugetlb_reserve_pages for the same page.  Also,
>  		 * the reservation count added in hugetlb_reserve_pages
>  		 * no longer applies.
>  		 */
> -		long rsv_adjust;
> +		if (unlikely(map_chg == MAP_CHG_NEEDED && retval == 0)) {
> +			long rsv_adjust;
>  
> -		rsv_adjust = hugepage_subpool_put_pages(spool, 1);
> -		hugetlb_acct_memory(h, -rsv_adjust);
> -		if (deferred_reserve) {
> -			spin_lock_irq(&hugetlb_lock);
> -			hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h),
> -					pages_per_huge_page(h), folio);
> -			spin_unlock_irq(&hugetlb_lock);
> +			rsv_adjust = hugepage_subpool_put_pages(spool, 1);
> +			hugetlb_acct_memory(h, -rsv_adjust);
> +			if (map_chg) {
> +				spin_lock_irq(&hugetlb_lock);
> +				hugetlb_cgroup_uncharge_folio_rsvd(
> +					hstate_index(h), pages_per_huge_page(h),
> +					folio);
> +				spin_unlock_irq(&hugetlb_lock);
> +			}
>  		}
>  	}
>  
> @@ -3093,14 +3136,15 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
>  out_uncharge_cgroup:
>  	hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg);
>  out_uncharge_cgroup_reservation:
> -	if (deferred_reserve)
> +	if (map_chg)
>  		hugetlb_cgroup_uncharge_cgroup_rsvd(idx, pages_per_huge_page(h),
>  						    h_cg);

and the same for this one.
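
To make the question above concrete: if MAP_CHG_ENFORCED is intended to
take the reservation cgroup charge (which is what the plain
"if (map_chg)" tests do today), a one-line comment at the first site
would help.  If it is not, then presumably all three flagged sites want
the explicit state test so that charge, commit and uncharge stay
balanced.  Untested sketch of the first site:

	-	if (map_chg) {
	+	if (map_chg == MAP_CHG_NEEDED) {
			ret = hugetlb_cgroup_charge_cgroup_rsvd(
				idx, pages_per_huge_page(h), &h_cg);

(with matching changes at the commit and uncharge sites).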
>  out_subpool_put:
> -	if (map_chg || cow_from_owner)
> +	if (map_chg)
>  		hugepage_subpool_put_pages(spool, 1);
>  out_end_reservation:
> -	vma_end_reservation(h, vma, addr);
> +	if (map_chg != MAP_CHG_ENFORCED)
> +		vma_end_reservation(h, vma, addr);
>  	if (!memcg_charge_ret)
>  		mem_cgroup_cancel_charge(memcg, nr_pages);
>  	mem_cgroup_put(memcg);
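
FWIW, my attempt at summarizing what each state ends up doing in this
patch, as far as I can tell (please correct me if I misread anything):

	state              subpool page   rsvd cgroup charge   vma commit()/end()
	MAP_CHG_REUSE      no             no                   yes
	MAP_CHG_NEEDED     yes            yes                  yes
	MAP_CHG_ENFORCED   yes            yes (the question)   no

If that is right, the only behavioral difference between NEEDED and
ENFORCED is skipping the commit()/end() transaction, which matches the
NOTE in the enum.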