For drivers/modules running inside a memalloc_flags_{save,restore}
region, make sure that work executed on a work-queue created inside
that region inherits the same flag(s). This conditionally enables such
drivers to work correctly underneath block I/O devices.

Concretely, this commit ensures that any work queued later on a
work-queue that was created during module initialization, while
current's flags had any of the PF_MEMALLOC* bits set, inherits those
same flags. We do this to enable drivers to be used as a network block
I/O device, in order to support XFS or other file-systems on top of a
raw block device which uses said drivers as the network transport
layer.

Under intense memory pressure, memory reclaim kicks in. Assume the
file-system reclaims memory and goes to the raw block device, which
calls into said drivers. Now, if regular GFP_KERNEL allocations in the
drivers require reclaim to be fulfilled, we end up in a circular
dependency.

We break this circular dependency by:

1. Forcing all allocations in the drivers to use GFP_NOIO, by means of
   a parenthetic use of memalloc_flags_{save,restore} around all
   relevant entry points, setting/clearing the PF_MEMALLOC_NOIO bit.

2. Making sure work-queues inherit current->flags wrt.
   PF_MEMALLOC_NOIO, such that work executed on a work-queue inherits
   the same flag(s). That is what this commit contributes.

Signed-off-by: Håkon Bugge <haakon.bugge@xxxxxxxxxx>

---
v2 -> v3:
   * Add support for all PF_MEMALLOC* flags
   * Re-worded commit message

v1 -> v2:
   * Added missing hunk in alloc_workqueue()

A minimal driver-side sketch illustrating point 1 is appended after the
patch.
---
 include/linux/workqueue.h |  9 ++++++
 kernel/workqueue.c        | 60 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 69 insertions(+)

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index fb39938945365..f8c87f824272b 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -406,9 +406,18 @@ enum wq_flags {
 	__WQ_DRAINING		= 1 << 16, /* internal: workqueue is draining */
 	__WQ_ORDERED		= 1 << 17, /* internal: workqueue is ordered */
 	__WQ_LEGACY		= 1 << 18, /* internal: create*_workqueue() */
+	__WQ_MEMALLOC		= 1 << 19, /* internal: execute work with MEMALLOC */
+	__WQ_MEMALLOC_NOFS	= 1 << 20, /* internal: execute work with MEMALLOC_NOFS */
+	__WQ_MEMALLOC_NOIO	= 1 << 21, /* internal: execute work with MEMALLOC_NOIO */
+	__WQ_MEMALLOC_NORECLAIM	= 1 << 22, /* internal: execute work with MEMALLOC_NORECLAIM */
+	__WQ_MEMALLOC_NOWARN	= 1 << 23, /* internal: execute work with MEMALLOC_NOWARN */
+	__WQ_MEMALLOC_PIN	= 1 << 24, /* internal: execute work with MEMALLOC_PIN */
 
 	/* BH wq only allows the following flags */
 	__WQ_BH_ALLOWS		= WQ_BH | WQ_HIGHPRI,
+
+	__WQ_PF_MEMALLOC_MASK	= PF_MEMALLOC | PF_MEMALLOC_NOFS | PF_MEMALLOC_NOIO |
+				  PF_MEMALLOC_NORECLAIM | PF_MEMALLOC_NOWARN | PF_MEMALLOC_PIN,
 };
 
 enum wq_consts {
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 003474c9a77d0..28ed6b9556e91 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -51,6 +51,7 @@
 #include <linux/uaccess.h>
 #include <linux/sched/isolation.h>
 #include <linux/sched/debug.h>
+#include <linux/sched/mm.h>
 #include <linux/nmi.h>
 #include <linux/kvm_para.h>
 #include <linux/delay.h>
@@ -3113,6 +3114,28 @@ static bool manage_workers(struct worker *worker)
 	return true;
 }
 
+static unsigned int wq_build_memalloc_flags(struct pool_workqueue *pwq)
+{
+	unsigned int pf_flags = 0;
+
+#define BUILD_PF_FLAGS_FROM_WQ(name)				\
+	do {							\
+		if (pwq->wq->flags & __WQ_ ## name)		\
+			pf_flags |= PF_ ## name;		\
+	} while (0)
+
+	BUILD_PF_FLAGS_FROM_WQ(MEMALLOC);
+	BUILD_PF_FLAGS_FROM_WQ(MEMALLOC_NOFS);
+	BUILD_PF_FLAGS_FROM_WQ(MEMALLOC_NOIO);
+	BUILD_PF_FLAGS_FROM_WQ(MEMALLOC_NORECLAIM);
+	BUILD_PF_FLAGS_FROM_WQ(MEMALLOC_NOWARN);
+	BUILD_PF_FLAGS_FROM_WQ(MEMALLOC_PIN);
+
+#undef BUILD_PF_FLAGS_FROM_WQ
+
+	return pf_flags;
+}
+
 /**
  * process_one_work - process single work
  * @worker: self
@@ -3136,6 +3159,8 @@ __acquires(&pool->lock)
 	unsigned long work_data;
 	int lockdep_start_depth, rcu_start_depth;
 	bool bh_draining = pool->flags & POOL_BH_DRAINING;
+	unsigned int memalloc_flags = wq_build_memalloc_flags(pwq);
+	unsigned int memalloc_flags_old;
 #ifdef CONFIG_LOCKDEP
 	/*
 	 * It is permissible to free the struct work_struct from
@@ -3148,6 +3173,10 @@ __acquires(&pool->lock)
 	lockdep_copy_map(&lockdep_map, &work->lockdep_map);
 #endif
 
+	/* Set inherited alloc flags */
+	if (memalloc_flags)
+		memalloc_flags_old = memalloc_flags_save(memalloc_flags);
+
 	/* ensure we're on the correct CPU */
 	WARN_ON_ONCE(!(pool->flags & POOL_DISASSOCIATED) &&
 		     raw_smp_processor_id() != pool->cpu);
@@ -3284,6 +3313,10 @@ __acquires(&pool->lock)
 
 	/* must be the last step, see the function comment */
 	pwq_dec_nr_in_flight(pwq, work_data);
+
+	/* Restore alloc flags */
+	if (memalloc_flags)
+		memalloc_flags_restore(memalloc_flags_old);
 }
 
 /**
@@ -5637,6 +5670,30 @@ static void wq_adjust_max_active(struct workqueue_struct *wq)
 	} while (activated);
 }
 
+/**
+ * wq_set_memalloc_flags - Test current->flags for PF_MEMALLOC_FOO_BAR
+ * flag bits and set the corresponding __WQ_MEMALLOC_FOO_BAR in the
+ * WQ's flags variable.
+ * @flags_ptr: Pointer to wq->flags
+ */
+static void wq_set_memalloc_flags(unsigned int *flags_ptr)
+{
+#define TEST_PF_SET_WQ(name)				\
+	do {						\
+		if (current->flags & PF_ ## name)	\
+			*flags_ptr |= __WQ_ ## name;	\
+	} while (0)
+
+	TEST_PF_SET_WQ(MEMALLOC);
+	TEST_PF_SET_WQ(MEMALLOC_NOFS);
+	TEST_PF_SET_WQ(MEMALLOC_NOIO);
+	TEST_PF_SET_WQ(MEMALLOC_NORECLAIM);
+	TEST_PF_SET_WQ(MEMALLOC_NOWARN);
+	TEST_PF_SET_WQ(MEMALLOC_PIN);
+
+#undef TEST_PF_SET_WQ
+}
+
 __printf(1, 4)
 struct workqueue_struct *alloc_workqueue(const char *fmt,
 					 unsigned int flags,
@@ -5695,6 +5752,9 @@ struct workqueue_struct *alloc_workqueue(const char *fmt,
 
 	/* init wq */
 	wq->flags = flags;
+	if (current->flags & __WQ_PF_MEMALLOC_MASK)
+		wq_set_memalloc_flags(&wq->flags);
+
 	wq->max_active = max_active;
 	wq->min_active = min(max_active, WQ_DFL_MIN_ACTIVE);
 	wq->saved_max_active = wq->max_active;
-- 
2.31.1
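
For reference, a minimal driver-side sketch of the usage described in
point 1 above, assuming this patch is applied. The "foo" module, the
work-queue name and the functions below are hypothetical and only for
illustration; memalloc_noio_save()/memalloc_noio_restore() and
alloc_workqueue() are the real kernel APIs involved:

#include <linux/module.h>
#include <linux/sched/mm.h>
#include <linux/workqueue.h>

static struct workqueue_struct *foo_wq;

static int __init foo_init(void)
{
	unsigned int noio;
	int ret = 0;

	/*
	 * Parenthetic use of the NOIO scope: PF_MEMALLOC_NOIO is set in
	 * current->flags here, so the work-queue created below picks up
	 * __WQ_MEMALLOC_NOIO and work later queued on it is executed
	 * with PF_MEMALLOC_NOIO set, i.e. GFP_NOIO semantics.
	 */
	noio = memalloc_noio_save();

	foo_wq = alloc_workqueue("foo_wq", WQ_UNBOUND, 0);
	if (!foo_wq)
		ret = -ENOMEM;

	memalloc_noio_restore(noio);

	return ret;
}

static void __exit foo_exit(void)
{
	destroy_workqueue(foo_wq);
}

module_init(foo_init);
module_exit(foo_exit);
MODULE_LICENSE("GPL");

Any work subsequently queued on foo_wq then runs with PF_MEMALLOC_NOIO,
so its allocations cannot recurse back into the block I/O path during
reclaim.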