On Mon, 13 Jan 2020, Mike Kravetz wrote: > > diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c > > index 35415af9ed26f..b03270b0d5833 100644 > > --- a/mm/hugetlb_cgroup.c > > +++ b/mm/hugetlb_cgroup.c > > @@ -96,8 +96,12 @@ static inline bool hugetlb_cgroup_have_usage(struct hugetlb_cgroup *h_cg) > > int idx; > > > > for (idx = 0; idx < hugetlb_max_hstate; idx++) { > > - if (page_counter_read(&h_cg->hugepage[idx])) > > + if (page_counter_read( > > + hugetlb_cgroup_get_counter(h_cg, idx, true)) || > > + page_counter_read( > > + hugetlb_cgroup_get_counter(h_cg, idx, false))) { > > return true; > > + } > > } > > return false; > > } > > @@ -108,18 +112,32 @@ static void hugetlb_cgroup_init(struct hugetlb_cgroup *h_cgroup, > > int idx; > > > > for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) { > > - struct page_counter *counter = &h_cgroup->hugepage[idx]; > > - struct page_counter *parent = NULL; > > + struct page_counter *fault_parent = NULL; > > + struct page_counter *reserved_parent = NULL; > > unsigned long limit; > > int ret; > > > > - if (parent_h_cgroup) > > - parent = &parent_h_cgroup->hugepage[idx]; > > - page_counter_init(counter, parent); > > + if (parent_h_cgroup) { > > + fault_parent = hugetlb_cgroup_get_counter( > > + parent_h_cgroup, idx, false); > > + reserved_parent = hugetlb_cgroup_get_counter( > > + parent_h_cgroup, idx, true); > > + } > > + page_counter_init(hugetlb_cgroup_get_counter(h_cgroup, idx, > > + false), > > + fault_parent); > > + page_counter_init(hugetlb_cgroup_get_counter(h_cgroup, idx, > > + true), > > + reserved_parent); > > > > limit = round_down(PAGE_COUNTER_MAX, > > 1 << huge_page_order(&hstates[idx])); > > - ret = page_counter_set_max(counter, limit); > > + > > + ret = page_counter_set_max( > > + hugetlb_cgroup_get_counter(h_cgroup, idx, false), > > + limit); > > + ret = page_counter_set_max( > > + hugetlb_cgroup_get_counter(h_cgroup, idx, true), limit); > > VM_BUG_ON(ret); > > The second page_counter_set_max() call overwrites ret before the check in > VM_BUG_ON(). > > > } > > } > > @@ -149,7 +167,6 @@ static void hugetlb_cgroup_css_free(struct cgroup_subsys_state *css) > > kfree(h_cgroup); > > } > > > > - > > /* > > * Should be called with hugetlb_lock held. > > * Since we are holding hugetlb_lock, pages cannot get moved from > > @@ -165,7 +182,7 @@ static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg, > > struct hugetlb_cgroup *page_hcg; > > struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(h_cg); > > > > - page_hcg = hugetlb_cgroup_from_page(page); > > + page_hcg = hugetlb_cgroup_from_page(page, false); > > /* > > * We can have pages in active list without any cgroup > > * ie, hugepage with less than 3 pages. We can safely > > @@ -184,7 +201,7 @@ static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg, > > /* Take the pages off the local counter */ > > page_counter_cancel(counter, nr_pages); > > > > - set_hugetlb_cgroup(page, parent); > > + set_hugetlb_cgroup(page, parent, false); > > out: > > return; > > } > > @@ -227,7 +244,7 @@ static inline void hugetlb_event(struct hugetlb_cgroup *hugetlb, int idx, > > } > > > > int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, > > - struct hugetlb_cgroup **ptr) > > + struct hugetlb_cgroup **ptr, bool reserved) > > { > > int ret = 0; > > struct page_counter *counter; > > @@ -250,13 +267,20 @@ int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, > > } > > rcu_read_unlock(); > > > > - if (!page_counter_try_charge(&h_cg->hugepage[idx], nr_pages, > > - &counter)) { > > + if (!page_counter_try_charge(hugetlb_cgroup_get_counter(h_cg, idx, > > + reserved), > > + nr_pages, &counter)) { > > ret = -ENOMEM; > > hugetlb_event(hugetlb_cgroup_from_counter(counter, idx), idx, > > HUGETLB_MAX); > > + css_put(&h_cg->css); > > + goto done; > > } > > - css_put(&h_cg->css); > > + /* Reservations take a reference to the css because they do not get > > + * reparented. > > I'm hoping someone with more cgroup knowledge can comment on this and any > consequences of not reparenting reservations. We previously talked about > why reparenting would be very difficult/expensive. I understand why you are > nopt doing it. Just do not fully understand what needs to be done from the > cgroup side. > I don't see any description of how hugetlb_cgroup currently acts wrt reparenting in the last patch in the series and how this is the same or different for reservations. I think the discussion that is referenced here is probably lost in some previous posting of the series. I think it's particularly useful information that the end user will need to know about for its handling so it would benefit from some documentation in the last patch.