In codepaths that use the begin/retry interface for reading mems_allowed_seq with irqs disabled, there exists a race condition that stalls the patch process after only modifying a subset of the static_branch call sites. This problem manifested itself as a dead lock in the slub allocator, inside get_any_partial. The loop reads mems_allowed_seq value (via read_mems_allowed_begin), performs the defrag operation, and then verifies the consistency of mem_allowed via the read_mems_allowed_retry and the cookie returned by xxx_begin. The issue here is that both begin and retry first check if cpusets are enabled via cpusets_enabled() static branch. This branch can be rewritted dynamically (via cpuset_inc) if a new cpuset is created. The x86 jump label code fully synchronizes across all CPUs for every entry it rewrites. If it rewrites only one of the callsites (specifically the one in read_mems_allowed_retry) and then waits for the smp_call_function(do_sync_core) to complete while a CPU is inside the begin/retry section with IRQs off and the mems_allowed value is changed, we can hang. This is because begin() will always return 0 (since it wasn't patched yet) while retry() will test the 0 against the actual value of the seq counter. The fix is to cache the value that's returned by cpusets_enabled() at the top of the loop, and only operate on the seqcount (both begin and retry) if it was true. The relevant stack traces of the two stuck threads: CPU: 107 PID: 1415 Comm: mkdir Tainted: G L 4.9.36-00104-g540c51286237 #4 Hardware name: Default string Default string/Hardware, BIOS 4.29.1-20170526215256 05/26/2017 task: ffff8817f9c28000 task.stack: ffffc9000ffa4000 RIP: smp_call_function_many+0x1f9/0x260 Call Trace: ? setup_data_read+0xa0/0xa0 ? ___slab_alloc+0x28b/0x5a0 smp_call_function+0x3b/0x70 ? setup_data_read+0xa0/0xa0 on_each_cpu+0x2f/0x90 ? ___slab_alloc+0x28a/0x5a0 ? ___slab_alloc+0x28b/0x5a0 text_poke_bp+0x87/0xd0 ? ___slab_alloc+0x28a/0x5a0 arch_jump_label_transform+0x93/0x100 __jump_label_update+0x77/0x90 jump_label_update+0xaa/0xc0 static_key_slow_inc+0x9e/0xb0 cpuset_css_online+0x70/0x2e0 online_css+0x2c/0xa0 cgroup_apply_control_enable+0x27f/0x3d0 cgroup_mkdir+0x2b7/0x420 kernfs_iop_mkdir+0x5a/0x80 vfs_mkdir+0xf6/0x1a0 SyS_mkdir+0xb7/0xe0 entry_SYSCALL_64_fastpath+0x18/0xad ... CPU: 22 PID: 1 Comm: init Tainted: G L 4.9.36-00104-g540c51286237 #4 Hardware name: Default string Default string/Hardware, BIOS 4.29.1-20170526215256 05/26/2017 task: ffff8818087c0000 task.stack: ffffc90000030000 RIP: int3+0x39/0x70 Call Trace: <#DB> ? ___slab_alloc+0x28b/0x5a0 <EOE> ? copy_process.part.40+0xf7/0x1de0 ? __slab_alloc.isra.80+0x54/0x90 ? copy_process.part.40+0xf7/0x1de0 ? copy_process.part.40+0xf7/0x1de0 ? kmem_cache_alloc_node+0x8a/0x280 ? copy_process.part.40+0xf7/0x1de0 ? _do_fork+0xe7/0x6c0 ? _raw_spin_unlock_irq+0x2d/0x60 ? trace_hardirqs_on_caller+0x136/0x1d0 ? entry_SYSCALL_64_fastpath+0x5/0xad ? do_syscall_64+0x27/0x350 ? SyS_clone+0x19/0x20 ? do_syscall_64+0x60/0x350 ? entry_SYSCALL64_slow_path+0x25/0x25 Reported-by: Cliff Spradlin <cspradlin@xxxxxxxxx> Signed-off-by: Dima Zavin <dmitriyz@xxxxxxxxx> --- v2: - Moved the cached cpusets_enabled() state into the cookie, turned the cookie into a struct and updated all the other call sites. - Applied on top of v4.12 since one of the callers in page_alloc.c changed. Still only tested on v4.9.36 and compile tested against v4.12. include/linux/cpuset.h | 27 +++++++++++++++++---------- mm/filemap.c | 6 +++--- mm/hugetlb.c | 12 ++++++------ mm/mempolicy.c | 12 ++++++------ mm/page_alloc.c | 8 ++++---- mm/slab.c | 6 +++--- mm/slub.c | 6 +++--- 7 files changed, 42 insertions(+), 35 deletions(-) diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 119a3f9604b0..f64f6d3b1dce 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -16,6 +16,11 @@ #include <linux/mm.h> #include <linux/jump_label.h> +struct cpuset_mems_cookie { + unsigned int seq; + bool was_enabled; +}; + #ifdef CONFIG_CPUSETS extern struct static_key_false cpusets_enabled_key; @@ -113,12 +118,15 @@ extern void cpuset_print_current_mems_allowed(void); * causing process failure. A retry loop with read_mems_allowed_begin and * read_mems_allowed_retry prevents these artificial failures. */ -static inline unsigned int read_mems_allowed_begin(void) +static inline void read_mems_allowed_begin(struct cpuset_mems_cookie *cookie) { - if (!cpusets_enabled()) - return 0; + if (!cpusets_enabled()) { + cookie->was_enabled = false; + return; + } - return read_seqcount_begin(¤t->mems_allowed_seq); + cookie->was_enabled = true; + cookie->seq = read_seqcount_begin(¤t->mems_allowed_seq); } /* @@ -127,12 +135,11 @@ static inline unsigned int read_mems_allowed_begin(void) * update of mems_allowed. It is up to the caller to retry the operation if * appropriate. */ -static inline bool read_mems_allowed_retry(unsigned int seq) +static inline bool read_mems_allowed_retry(struct cpuset_mems_cookie *cookie) { - if (!cpusets_enabled()) + if (!cookie->was_enabled) return false; - - return read_seqcount_retry(¤t->mems_allowed_seq, seq); + return read_seqcount_retry(¤t->mems_allowed_seq, cookie->seq); } static inline void set_mems_allowed(nodemask_t nodemask) @@ -249,12 +256,12 @@ static inline void set_mems_allowed(nodemask_t nodemask) { } -static inline unsigned int read_mems_allowed_begin(void) +static inline void read_mems_allowed_begin(struct cpuset_mems_cookie *cookie) { return 0; } -static inline bool read_mems_allowed_retry(unsigned int seq) +static inline bool read_mems_allowed_retry(struct cpuset_mems_cookie *cookie) { return false; } diff --git a/mm/filemap.c b/mm/filemap.c index 6f1be573a5e6..c0730b377519 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -716,12 +716,12 @@ struct page *__page_cache_alloc(gfp_t gfp) struct page *page; if (cpuset_do_page_mem_spread()) { - unsigned int cpuset_mems_cookie; + struct cpuset_mems_cookie cpuset_mems_cookie; do { - cpuset_mems_cookie = read_mems_allowed_begin(); + read_mems_allowed_begin(&cpuset_mems_cookie); n = cpuset_mem_spread_node(); page = __alloc_pages_node(n, gfp, 0); - } while (!page && read_mems_allowed_retry(cpuset_mems_cookie)); + } while (!page && read_mems_allowed_retry(&cpuset_mems_cookie)); return page; } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 3eedb187e549..1defa44f4fe6 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -907,7 +907,7 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, struct zonelist *zonelist; struct zone *zone; struct zoneref *z; - unsigned int cpuset_mems_cookie; + struct cpuset_mems_cookie cpuset_mems_cookie; /* * A child process with MAP_PRIVATE mappings created by their parent @@ -923,7 +923,7 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, goto err; retry_cpuset: - cpuset_mems_cookie = read_mems_allowed_begin(); + read_mems_allowed_begin(&cpuset_mems_cookie); zonelist = huge_zonelist(vma, address, htlb_alloc_mask(h), &mpol, &nodemask); @@ -945,7 +945,7 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, } mpol_cond_put(mpol); - if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) + if (unlikely(!page && read_mems_allowed_retry(&cpuset_mems_cookie))) goto retry_cpuset; return page; @@ -1511,7 +1511,7 @@ static struct page *__hugetlb_alloc_buddy_huge_page(struct hstate *h, { int order = huge_page_order(h); gfp_t gfp = htlb_alloc_mask(h)|__GFP_COMP|__GFP_REPEAT|__GFP_NOWARN; - unsigned int cpuset_mems_cookie; + struct cpuset_mems_cookie cpuset_mems_cookie; /* * We need a VMA to get a memory policy. If we do not @@ -1548,13 +1548,13 @@ static struct page *__hugetlb_alloc_buddy_huge_page(struct hstate *h, struct zonelist *zl; nodemask_t *nodemask; - cpuset_mems_cookie = read_mems_allowed_begin(); + read_mems_allowed_begin(&cpuset_mems_cookie); zl = huge_zonelist(vma, addr, gfp, &mpol, &nodemask); mpol_cond_put(mpol); page = __alloc_pages_nodemask(gfp, order, zl, nodemask); if (page) return page; - } while (read_mems_allowed_retry(cpuset_mems_cookie)); + } while (read_mems_allowed_retry(&cpuset_mems_cookie)); return NULL; } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 37d0b334bfe9..b4f2513a2296 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1971,13 +1971,13 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, { struct mempolicy *pol; struct page *page; - unsigned int cpuset_mems_cookie; + struct cpuset_mems_cookie cpuset_mems_cookie; struct zonelist *zl; nodemask_t *nmask; retry_cpuset: pol = get_vma_policy(vma, addr); - cpuset_mems_cookie = read_mems_allowed_begin(); + read_mems_allowed_begin(&cpuset_mems_cookie); if (pol->mode == MPOL_INTERLEAVE) { unsigned nid; @@ -2019,7 +2019,7 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, page = __alloc_pages_nodemask(gfp, order, zl, nmask); mpol_cond_put(pol); out: - if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) + if (unlikely(!page && read_mems_allowed_retry(&cpuset_mems_cookie))) goto retry_cpuset; return page; } @@ -2047,13 +2047,13 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order) { struct mempolicy *pol = &default_policy; struct page *page; - unsigned int cpuset_mems_cookie; + struct cpuset_mems_cookie cpuset_mems_cookie; if (!in_interrupt() && !(gfp & __GFP_THISNODE)) pol = get_task_policy(current); retry_cpuset: - cpuset_mems_cookie = read_mems_allowed_begin(); + read_mems_allowed_begin(&cpuset_mems_cookie); /* * No reference counting needed for current->mempolicy @@ -2066,7 +2066,7 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order) policy_zonelist(gfp, pol, numa_node_id()), policy_nodemask(gfp, pol)); - if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) + if (unlikely(!page && read_mems_allowed_retry(&cpuset_mems_cookie))) goto retry_cpuset; return page; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2302f250d6b1..36cd4e95fb38 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3688,7 +3688,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, int no_progress_loops; unsigned long alloc_start = jiffies; unsigned int stall_timeout = 10 * HZ; - unsigned int cpuset_mems_cookie; + struct cpuset_mems_cookie cpuset_mems_cookie; /* * In the slowpath, we sanity check order to avoid ever trying to @@ -3713,7 +3713,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, compaction_retries = 0; no_progress_loops = 0; compact_priority = DEF_COMPACT_PRIORITY; - cpuset_mems_cookie = read_mems_allowed_begin(); + read_mems_allowed_begin(&cpuset_mems_cookie); /* * The fast path uses conservative alloc_flags to succeed only until @@ -3872,7 +3872,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, * It's possible we raced with cpuset update so the OOM would be * premature (see below the nopage: label for full explanation). */ - if (read_mems_allowed_retry(cpuset_mems_cookie)) + if (read_mems_allowed_retry(&cpuset_mems_cookie)) goto retry_cpuset; /* Reclaim has failed us, start killing things */ @@ -3900,7 +3900,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, * to fail, check if the cpuset changed during allocation and if so, * retry. */ - if (read_mems_allowed_retry(cpuset_mems_cookie)) + if (read_mems_allowed_retry(&cpuset_mems_cookie)) goto retry_cpuset; /* diff --git a/mm/slab.c b/mm/slab.c index 2a31ee3c5814..391fe9d9d24e 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3195,13 +3195,13 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) void *obj = NULL; struct page *page; int nid; - unsigned int cpuset_mems_cookie; + struct cpuset_mems_cookie cpuset_mems_cookie; if (flags & __GFP_THISNODE) return NULL; retry_cpuset: - cpuset_mems_cookie = read_mems_allowed_begin(); + read_mems_allowed_begin(&cpuset_mems_cookie); zonelist = node_zonelist(mempolicy_slab_node(), flags); retry: @@ -3245,7 +3245,7 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) } } - if (unlikely(!obj && read_mems_allowed_retry(cpuset_mems_cookie))) + if (unlikely(!obj && read_mems_allowed_retry(&cpuset_mems_cookie))) goto retry_cpuset; return obj; } diff --git a/mm/slub.c b/mm/slub.c index 8addc535bcdc..55c4862852ec 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1849,7 +1849,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags, struct zone *zone; enum zone_type high_zoneidx = gfp_zone(flags); void *object; - unsigned int cpuset_mems_cookie; + struct cpuset_mems_cookie cpuset_mems_cookie; /* * The defrag ratio allows a configuration of the tradeoffs between @@ -1874,7 +1874,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags, return NULL; do { - cpuset_mems_cookie = read_mems_allowed_begin(); + read_mems_allowed_begin(&cpuset_mems_cookie); zonelist = node_zonelist(mempolicy_slab_node(), flags); for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { struct kmem_cache_node *n; @@ -1896,7 +1896,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags, } } } - } while (read_mems_allowed_retry(cpuset_mems_cookie)); + } while (read_mems_allowed_retry(&cpuset_mems_cookie)); #endif return NULL; } -- 2.14.0.rc0.400.g1c36432dff-goog -- To unsubscribe from this list: send the line "unsubscribe cgroups" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html