For shared mappings, the pointer to the hugetlb_cgroup to uncharge lives
in the resv_map entries, in file_region->reservation_counter.

When a file_region entry is added to the resv_map via region_add, we
also charge the appropriate hugetlb_cgroup and put the pointer to that
in file_region->reservation_counter. This is slightly delicate since we
need to not modify the resv_map until we know that charging the
reservation has succeeded. If charging doesn't succeed, we report the
error to the caller, so that the kernel fails the reservation.

On region_del, which is when the hugetlb memory is unreserved, we delete
the file_region entry in the resv_map, but also uncharge the
file_region->reservation_counter.

---
 mm/hugetlb.c | 208 +++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 170 insertions(+), 38 deletions(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 235996aef6618..d76e3137110ab 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -242,8 +242,72 @@ struct file_region {
 	struct list_head link;
 	long from;
 	long to;
+#ifdef CONFIG_CGROUP_HUGETLB
+	/*
+	 * On shared mappings, each reserved region appears as a struct
+	 * file_region in resv_map. These fields hold the info needed to
+	 * uncharge each reservation.
+	 */
+	struct page_counter *reservation_counter;
+	unsigned long pages_per_hpage;
+#endif
 };
 
+/* Must be called with resv->lock held. If dry_run, only count the pages added
+ * (*t is still extended, the list is untouched). h_cg may be NULL: no counter.
+ */
+static long consume_regions_we_overlap_with(struct file_region *rg,
+		struct list_head *head, long f, long *t,
+		struct hugetlb_cgroup *h_cg,
+		struct hstate *h,
+		bool dry_run)
+{
+	long add = 0;
+	struct file_region *trg = NULL, *nrg = NULL;
+
+	/* Consume any regions we now overlap with. */
+	nrg = rg;
+	list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
+		if (&rg->link == head)
+			break;
+		if (rg->from > *t)
+			break;
+
+		/* If this area reaches higher then extend our area to
+		 * include it completely. If this is not the first area
+		 * which we intend to reuse, free it.
+		 */
+		if (rg->to > *t)
+			*t = rg->to;
+		if (rg != nrg) {
+			/* Decrement return value by the deleted range.
+			 * Another range will span this area so that by
+			 * end of routine add will be >= zero
+			 */
+			add -= (rg->to - rg->from);
+			if (!dry_run) {
+				list_del(&rg->link);
+				kfree(rg);
+			}
+		}
+	}
+
+	add += (nrg->from - f); /* Added to beginning of region */
+	add += *t - nrg->to; /* Added to end of region */
+
+	if (!dry_run) {
+		nrg->from = f;
+		nrg->to = *t;
+#ifdef CONFIG_CGROUP_HUGETLB
+		nrg->reservation_counter = h_cg ?
+			&h_cg->reserved_hugepage[hstate_index(h)] : NULL;
+		nrg->pages_per_hpage = pages_per_huge_page(h);
+#endif
+	}
+
+	return add;
+}
+
 /*
  * Add the huge page range represented by [f, t) to the reserve
  * map. In the normal case, existing regions will be expanded
@@ -258,11 +322,13 @@ struct file_region {
  * Return the number of new huge pages added to the map. This
  * number is greater than or equal to zero.
  */
-static long region_add(struct resv_map *resv, long f, long t)
+static long region_add(struct hstate *h, struct resv_map *resv, long f, long t)
 {
 	struct list_head *head = &resv->regions;
-	struct file_region *rg, *nrg, *trg;
-	long add = 0;
+	struct file_region *rg, *nrg;
+	long add = 0, newadd = 0;
+	struct hugetlb_cgroup *h_cg = NULL;
+	int ret = 0;
 
 	spin_lock(&resv->lock);
 	/* Locate the region we are either in or before. */
@@ -277,6 +343,23 @@ static long region_add(struct resv_map *resv, long f, long t)
 	 * from the cache and use it for this range.
 	 */
 	if (&rg->link == head || t < rg->from) {
+#ifdef CONFIG_CGROUP_HUGETLB
+		/*
+		 * If resv->reservation_counter is NULL, then it means this is
+		 * a shared mapping, and hugetlb cgroup accounting should be
+		 * done on the file_region entries inside resv_map.
+		 */
+		if (!resv->reservation_counter) {
+			ret = hugetlb_cgroup_charge_cgroup(
+					hstate_index(h),
+					(t - f) * pages_per_huge_page(h),
+					&h_cg, true);
+		}
+
+		if (ret)
+			goto out_locked;
+#endif
+
 		VM_BUG_ON(resv->region_cache_count <= 0);
 
 		resv->region_cache_count--;
@@ -286,6 +369,15 @@ static long region_add(struct resv_map *resv, long f, long t)
 
 		nrg->from = f;
 		nrg->to = t;
+
+#ifdef CONFIG_CGROUP_HUGETLB
+		if (h_cg) {
+			nrg->reservation_counter =
+				&h_cg->reserved_hugepage[hstate_index(h)];
+			nrg->pages_per_hpage = pages_per_huge_page(h);
+		}
+#endif
+
 		list_add(&nrg->link, rg->link.prev);
 
 		add += t - f;
@@ -296,38 +388,37 @@ static long region_add(struct resv_map *resv, long f, long t)
 	if (f > rg->from)
 		f = rg->from;
 
-	/* Check for and consume any regions we now overlap with. */
-	nrg = rg;
-	list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
-		if (&rg->link == head)
-			break;
-		if (rg->from > t)
-			break;
-
-		/* If this area reaches higher then extend our area to
-		 * include it completely. If this is not the first area
-		 * which we intend to reuse, free it. */
-		if (rg->to > t)
-			t = rg->to;
-		if (rg != nrg) {
-			/* Decrement return value by the deleted range.
-			 * Another range will span this area so that by
-			 * end of routine add will be >= zero
-			 */
-			add -= (rg->to - rg->from);
-			list_del(&rg->link);
-			kfree(rg);
-		}
+	/* Dry run: count pages to add, needed with or without cgroups. */
+	add = consume_regions_we_overlap_with(rg, head, f, &t, NULL, NULL,
+					      true);
+
+#ifdef CONFIG_CGROUP_HUGETLB
+	if (!resv->reservation_counter) {
+		ret = hugetlb_cgroup_charge_cgroup(
+				hstate_index(h),
+				add * pages_per_huge_page(h),
+				&h_cg, true);
 	}
 
-	add += (nrg->from - f); /* Added to beginning of region */
-	nrg->from = f;
-	add += t - nrg->to; /* Added to end of region */
-	nrg->to = t;
+	if (ret)
+		goto out_locked;
+#endif
+
+	/* Check for and consume any regions we now overlap with. */
+	newadd = consume_regions_we_overlap_with(rg, head, f, &t, h_cg, h,
+						 false);
+	/*
+	 * If these aren't equal, then there is a bug with
+	 * consume_regions_we_overlap_with, and we're charging the wrong amount
+	 * of memory.
+	 */
+	WARN_ON(add != newadd);
 
 out_locked:
 	resv->adds_in_progress--;
 	spin_unlock(&resv->lock);
+	if (ret)
+		return ret;
 	VM_BUG_ON(add < 0);
 	return add;
 }
@@ -487,6 +578,10 @@ static long region_del(struct resv_map *resv, long f, long t)
 	struct file_region *rg, *trg;
 	struct file_region *nrg = NULL;
 	long del = 0;
+#ifdef CONFIG_CGROUP_HUGETLB
+	struct page_counter *reservation_counter = NULL;
+	unsigned long pages_per_hpage = 0;
+#endif
 
 retry:
 	spin_lock(&resv->lock);
@@ -514,6 +609,14 @@ static long region_del(struct resv_map *resv, long f, long t)
 				nrg = list_first_entry(&resv->region_cache,
 							struct file_region,
 							link);
+#ifdef CONFIG_CGROUP_HUGETLB
+				/*
+				 * Save counter information from the deleted
+				 * node, in case we need to do an uncharge.
+				 */
+				reservation_counter = nrg->reservation_counter;
+				pages_per_hpage = nrg->pages_per_hpage;
+#endif
 				list_del(&nrg->link);
 				resv->region_cache_count--;
 			}
@@ -543,6 +646,14 @@ static long region_del(struct resv_map *resv, long f, long t)
 
 		if (f <= rg->from && t >= rg->to) { /* Remove entire region */
 			del += rg->to - rg->from;
+#ifdef CONFIG_CGROUP_HUGETLB
+			/*
+			 * Save counter information from the deleted node,
+			 * in case we need to do an uncharge.
+			 */
+			reservation_counter = rg->reservation_counter;
+			pages_per_hpage = rg->pages_per_hpage;
+#endif
 			list_del(&rg->link);
 			kfree(rg);
 			continue;
@@ -559,6 +670,19 @@ static long region_del(struct resv_map *resv, long f, long t)
 
 	spin_unlock(&resv->lock);
 	kfree(nrg);
+#ifdef CONFIG_CGROUP_HUGETLB
+	/*
+	 * If resv->reservation_counter is NULL, then this is shared
+	 * reservation, and the reserved memory is tracked in the file_region
+	 * entries inside of resv_map. So we need to uncharge the memory here.
+	 */
+	if (reservation_counter && pages_per_hpage && del > 0 &&
+	    !resv->reservation_counter) {
+		hugetlb_cgroup_uncharge_counter(
+				reservation_counter,
+				del * pages_per_hpage);
+	}
+#endif
 	return del;
 }
 
@@ -1930,7 +2054,7 @@ static long __vma_reservation_common(struct hstate *h,
 		ret = region_chg(resv, idx, idx + 1);
 		break;
 	case VMA_COMMIT_RESV:
-		ret = region_add(resv, idx, idx + 1);
+		ret = region_add(h, resv, idx, idx + 1);
 		break;
 	case VMA_END_RESV:
 		region_abort(resv, idx, idx + 1);
@@ -1938,7 +2062,7 @@ static long __vma_reservation_common(struct hstate *h,
 		break;
 	case VMA_ADD_RESV:
 		if (vma->vm_flags & VM_MAYSHARE)
-			ret = region_add(resv, idx, idx + 1);
+			ret = region_add(h, resv, idx, idx + 1);
 		else {
 			region_abort(resv, idx, idx + 1);
 			ret = region_del(resv, idx, idx + 1);
@@ -4536,7 +4660,7 @@ int hugetlb_reserve_pages(struct inode *inode,
 					struct vm_area_struct *vma,
 					vm_flags_t vm_flags)
 {
-	long ret, chg;
+	long ret, chg, add = 0;
 	struct hstate *h = hstate_inode(inode);
 	struct hugepage_subpool *spool = subpool_inode(inode);
 	struct resv_map *resv_map;
@@ -4624,9 +4748,7 @@ int hugetlb_reserve_pages(struct inode *inode,
 	 */
 	ret = hugetlb_acct_memory(h, gbl_reserve);
 	if (ret < 0) {
-		/* put back original number of pages, chg */
-		(void)hugepage_subpool_put_pages(spool, chg);
-		goto out_err;
+		goto out_put_pages;
 	}
 
 	/*
@@ -4641,7 +4763,12 @@ int hugetlb_reserve_pages(struct inode *inode,
 	 * else has to be done for private mappings here
 	 */
 	if (!vma || vma->vm_flags & VM_MAYSHARE) {
-		long add = region_add(resv_map, from, to);
+		add = region_add(h, resv_map, from, to);
+		if (add < 0) {
+			ret = -ENOMEM;
+			goto out_acct_memory;
+		}
+
 
 		if (unlikely(chg > add)) {
 			/*
@@ -4659,10 +4786,15 @@ int hugetlb_reserve_pages(struct inode *inode,
 		}
 	}
 	return 0;
+out_acct_memory:
+	hugetlb_acct_memory(h, -gbl_reserve);
+out_put_pages:
+	/* put back original number of pages, chg */
+	(void)hugepage_subpool_put_pages(spool, chg);
 out_err:
 	if (!vma || vma->vm_flags & VM_MAYSHARE)
-		/* Don't call region_abort if region_chg failed */
-		if (chg >= 0)
+		/* Don't call region_abort if region_chg or region_add failed */
+		if (chg >= 0 && add >= 0)
 			region_abort(resv_map, from, to);
 	if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER))
 		kref_put(&resv_map->refs, resv_map_release);
-- 
2.23.0.rc1.153.gdeed80330f-goog