On 9/6/19 10:27 AM, Michal Hocko wrote:
> On Fri 06-09-19 01:11:53, Thomas Lindroth wrote:
>> On 9/4/19 6:39 PM, Tetsuo Handa wrote:
>>> On 2019/09/04 23:29, Michal Hocko wrote:
>>>> Ohh, right. We are trying to uncharge something that hasn't been charged
>>>> because page_counter_try_charge has failed. So the fix needs to be more
>>>> involved. Sorry, I should have realized that.
>>>
>>> OK. Survived the test. Thomas, please try.
>>>
>>>> ---
>>>> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
>>>> index 9ec5e12486a7..e18108b2b786 100644
>>>> --- a/mm/memcontrol.c
>>>> +++ b/mm/memcontrol.c
>>>> @@ -2821,6 +2821,16 @@ int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
>>>>
>>>>  	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) &&
>>>>  	    !page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) {
>>>> +
>>>> +		/*
>>>> +		 * Enforce __GFP_NOFAIL allocation because callers are not
>>>> +		 * prepared to see failures and likely do not have any failure
>>>> +		 * handling code.
>>>> +		 */
>>>> +		if (gfp & __GFP_NOFAIL) {
>>>> +			page_counter_charge(&memcg->kmem, nr_pages);
>>>> +			return 0;
>>>> +		}
>>>>  		cancel_charge(memcg, nr_pages);
>>>>  		return -ENOMEM;
>>>>  	}
>>>>
>>
>> I tried the patch with 5.2.11 and wasn't able to trigger any null pointer
>> deref crashes with it. Testing is tricky because the OOM killer will still
>> run and eventually kill bash and whatever runs in the cgroup.
>
> Yeah, this is unfortunate but also unfixable I am afraid.

I think there are two possible ways to fix this. If we decide to keep
kmem.limit_in_bytes broken, then we can just always bypass the limit. We
could also print something like

	pr_warn_once("kmem limit doesn't work");

when a user changes kmem.limit_in_bytes.
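For that bypass variant, one rough idea would be to keep charging the kmem
counter so usage stays visible for accounting, but never fail an allocation
because of it, and warn once when the limit is written. An untested sketch
of that (hunk contexts are approximate and the warning text is only a
placeholder):

--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ ... @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
 		case _KMEM:
+			/* placeholder wording; fires once on any kmem limit write */
+			pr_warn_once("kmem.limit_in_bytes does not work and may be removed\n");
 			ret = memcg_update_kmem_max(memcg, nr_pages);
 			break;
@@ ... @@ int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
 	unsigned int nr_pages = 1 << order;
-	struct page_counter *counter;
 	int ret;
 
 	ret = try_charge(memcg, gfp, nr_pages);
 	if (ret)
 		return ret;
 
-	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) &&
-	    !page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) {
-		cancel_charge(memcg, nr_pages);
-		return -ENOMEM;
-	}
+	/* keep kmem accounting, but never fail the charge on the kmem limit */
+	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
+		page_counter_charge(&memcg->kmem, nr_pages);
 	return 0;

That would keep kmem.usage_in_bytes meaningful while making the limit
advisory only.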
Or we can fix kmem.limit_in_bytes like this:

---
 mm/memcontrol.c | 76 +++++++++++++++++++++++++++++++------------------
 1 file changed, 48 insertions(+), 28 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 2d1d598d9554..71b9065e4b31 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1314,7 +1314,7 @@ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
  * Returns the maximum amount of memory @mem can be charged with, in
  * pages.
  */
-static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
+static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg, bool kmem)
 {
 	unsigned long margin = 0;
 	unsigned long count;
@@ -1334,6 +1334,15 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
 			margin = 0;
 	}
 
+	if (kmem && margin) {
+		count = page_counter_read(&memcg->kmem);
+		limit = READ_ONCE(memcg->kmem.max);
+		if (count <= limit)
+			margin = min(margin, limit - count);
+		else
+			margin = 0;
+	}
+
 	return margin;
 }
 
@@ -2505,7 +2514,7 @@ void mem_cgroup_handle_over_high(void)
 }
 
 static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
-		      unsigned int nr_pages)
+		      unsigned int nr_pages, bool kmem_charge)
 {
 	unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages);
 	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
@@ -2519,21 +2528,42 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
 	if (mem_cgroup_is_root(memcg))
 		return 0;
 retry:
-	if (consume_stock(memcg, nr_pages))
+	if (consume_stock(memcg, nr_pages)) {
+		if (kmem_charge && !page_counter_try_charge(&memcg->kmem,
+							    nr_pages, &counter)) {
+			refill_stock(memcg, nr_pages);
+			goto charge;
+		}
 		return 0;
+	}
 
+charge:
+	mem_over_limit = NULL;
 	if (!do_memsw_account() ||
 	    page_counter_try_charge(&memcg->memsw, batch, &counter)) {
-		if (page_counter_try_charge(&memcg->memory, batch, &counter))
-			goto done_restock;
-		if (do_memsw_account())
-			page_counter_uncharge(&memcg->memsw, batch);
-		mem_over_limit = mem_cgroup_from_counter(counter, memory);
+		if (!page_counter_try_charge(&memcg->memory, batch, &counter)) {
+			if (do_memsw_account())
+				page_counter_uncharge(&memcg->memsw, batch);
+			mem_over_limit = mem_cgroup_from_counter(counter, memory);
+		}
 	} else {
 		mem_over_limit = mem_cgroup_from_counter(counter, memsw);
 		may_swap = false;
 	}
 
+	if (!mem_over_limit && kmem_charge) {
+		if (!page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) {
+			may_swap = false;
+			mem_over_limit = mem_cgroup_from_counter(counter, kmem);
+			page_counter_uncharge(&memcg->memory, batch);
+			if (do_memsw_account())
+				page_counter_uncharge(&memcg->memsw, batch);
+		}
+	}
+
+	if (!mem_over_limit)
+		goto done_restock;
+
 	if (batch > nr_pages) {
 		batch = nr_pages;
 		goto retry;
@@ -2568,7 +2598,7 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
 	nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
 						    gfp_mask, may_swap);
 
-	if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
+	if (mem_cgroup_margin(mem_over_limit, kmem_charge) >= nr_pages)
 		goto retry;
 
 	if (!drained) {
@@ -2637,6 +2667,8 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
 	page_counter_charge(&memcg->memory, nr_pages);
 	if (do_memsw_account())
 		page_counter_charge(&memcg->memsw, nr_pages);
+	if (kmem_charge)
+		page_counter_charge(&memcg->kmem, nr_pages);
 	css_get_many(&memcg->css, nr_pages);
 
 	return 0;
@@ -2943,20 +2975,8 @@ void memcg_kmem_put_cache(struct kmem_cache *cachep)
 int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
 			      struct mem_cgroup *memcg)
 {
-	unsigned int nr_pages = 1 << order;
-	struct page_counter *counter;
-	int ret;
-
-	ret = try_charge(memcg, gfp, nr_pages);
-	if (ret)
-		return ret;
-
-	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) &&
-	    !page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) {
-		cancel_charge(memcg, nr_pages);
-		return -ENOMEM;
-	}
-	return 0;
+	return try_charge(memcg, gfp, 1 << order,
+			  !cgroup_subsys_on_dfl(memory_cgrp_subsys));
 }
 
 /**
@@ -5053,7 +5073,7 @@ static int mem_cgroup_do_precharge(unsigned long count)
 	int ret;
 
 	/* Try a single bulk charge without reclaim first, kswapd may wake */
-	ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_DIRECT_RECLAIM, count);
+	ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_DIRECT_RECLAIM, count, false);
 	if (!ret) {
 		mc.precharge += count;
 		return ret;
@@ -5061,7 +5081,7 @@ static int mem_cgroup_do_precharge(unsigned long count)
 
 	/* Try charges one by one with reclaim, but do not retry */
 	while (count--) {
-		ret = try_charge(mc.to, GFP_KERNEL | __GFP_NORETRY, 1);
+		ret = try_charge(mc.to, GFP_KERNEL | __GFP_NORETRY, 1, false);
 		if (ret)
 			return ret;
 		mc.precharge++;
@@ -6255,7 +6275,7 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
 	if (!memcg)
 		memcg = get_mem_cgroup_from_mm(mm);
 
-	ret = try_charge(memcg, gfp_mask, nr_pages);
+	ret = try_charge(memcg, gfp_mask, nr_pages, false);
 
 	css_put(&memcg->css);
 out:
@@ -6634,10 +6654,10 @@ bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
 
 	mod_memcg_state(memcg, MEMCG_SOCK, nr_pages);
 
-	if (try_charge(memcg, gfp_mask, nr_pages) == 0)
+	if (try_charge(memcg, gfp_mask, nr_pages, false) == 0)
 		return true;
 
-	try_charge(memcg, gfp_mask|__GFP_NOFAIL, nr_pages);
+	try_charge(memcg, gfp_mask|__GFP_NOFAIL, nr_pages, false);
 	return false;
 }
-- 
2.21.0