Naively charging loopback I/O to the original issuing blkcg can result
in priority inversion, because all loopback writes are funneled through
a single kworker thread.

This patch gives each loop device a workqueue and a tree mapping blkcg
css id -> struct loop_worker, where each loop_worker carries its own
queue of struct loop_cmd.  This allows individual blkcgs to make
forward progress while another is blocked due to resource control.
There is also a single rootcg loop_cmd queue for I/O not associated
with a blkcg (and as a fallback when a loop_worker cannot be
allocated).

The locking here is a bit heavy-handed - the spinlock is taken any
time the tree or one of the queues is touched.  However, the previous
implementation serialized everything through a single kworker, so I
don't think this is any worse.

This code previously took references on the cgroups (css_get()/
css_put()), but I don't believe that is necessary any more since the
bio itself pins the blkcg css until the I/O completes.

Signed-off-by: Dan Schatzberg <dschatzberg@xxxxxx>
---
 drivers/block/loop.c   | 179 ++++++++++++++++++++++++++++++++---------
 drivers/block/loop.h   |  11 +--
 kernel/cgroup/cgroup.c |   1 +
 3 files changed, 150 insertions(+), 41 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 739b372a5112..d0484eeecc9b 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -70,7 +70,6 @@
 #include <linux/writeback.h>
 #include <linux/completion.h>
 #include <linux/highmem.h>
-#include <linux/kthread.h>
 #include <linux/splice.h>
 #include <linux/sysfs.h>
 #include <linux/miscdevice.h>
@@ -78,6 +77,7 @@
 #include <linux/uio.h>
 #include <linux/ioprio.h>
 #include <linux/blk-cgroup.h>
+#include <linux/sched/mm.h>

 #include "loop.h"

@@ -503,8 +503,6 @@ static void lo_rw_aio_complete(struct kiocb *iocb, long ret, long ret2)
 {
 	struct loop_cmd *cmd = container_of(iocb, struct loop_cmd, iocb);

-	if (cmd->css)
-		css_put(cmd->css);
 	cmd->ret = ret;
 	lo_rw_aio_do_completion(cmd);
 }
@@ -565,8 +563,6 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
 	cmd->iocb.ki_complete = lo_rw_aio_complete;
 	cmd->iocb.ki_flags = IOCB_DIRECT;
 	cmd->iocb.ki_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0);
-	if (cmd->css)
-		kthread_associate_blkcg(cmd->css);

 	if (rw == WRITE)
 		ret = call_write_iter(file, &cmd->iocb, &iter);
@@ -574,7 +570,6 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
 		ret = call_read_iter(file, &cmd->iocb, &iter);

 	lo_rw_aio_do_completion(cmd);
-	kthread_associate_blkcg(NULL);

 	if (ret != -EIOCBQUEUED)
 		cmd->iocb.ki_complete(&cmd->iocb, ret, 0);
@@ -891,27 +886,95 @@ static void loop_config_discard(struct loop_device *lo)

 static void loop_unprepare_queue(struct loop_device *lo)
 {
-	kthread_flush_worker(&lo->worker);
-	kthread_stop(lo->worker_task);
-}
-
-static int loop_kthread_worker_fn(void *worker_ptr)
-{
-	current->flags |= PF_LESS_THROTTLE | PF_MEMALLOC_NOIO;
-	return kthread_worker_fn(worker_ptr);
+	flush_workqueue(lo->workqueue);
+	destroy_workqueue(lo->workqueue);
 }

 static int loop_prepare_queue(struct loop_device *lo)
 {
-	kthread_init_worker(&lo->worker);
-	lo->worker_task = kthread_run(loop_kthread_worker_fn,
-			&lo->worker, "loop%d", lo->lo_number);
-	if (IS_ERR(lo->worker_task))
+	lo->workqueue = alloc_workqueue("loop%d",
+					WQ_FREEZABLE | WQ_MEM_RECLAIM |
+					WQ_HIGHPRI, 0,
+					lo->lo_number);
+	if (!lo->workqueue)
 		return -ENOMEM;
-	set_user_nice(lo->worker_task, MIN_NICE);
+
 	return 0;
 }

+struct loop_worker {
+	struct rb_node rb_node;
+	struct work_struct work;
+	struct list_head cmd_list;
+	struct loop_device *lo;
+	int css_id;
+};
+
+static void loop_workfn(struct work_struct *work);
+static void loop_rootcg_workfn(struct work_struct *work);
+
+static void loop_queue_on_rootcg_locked(struct loop_device *lo,
+					struct loop_cmd *cmd)
+{
+	list_add_tail(&cmd->list_entry, &lo->rootcg_cmd_list);
+	if (list_is_first(&cmd->list_entry, &lo->rootcg_cmd_list))
+		queue_work(lo->workqueue, &lo->rootcg_work);
+}
+
+static void loop_queue_work(struct loop_device *lo, struct loop_cmd *cmd)
+{
+	struct rb_node **node = &(lo->worker_tree.rb_node), *parent = NULL;
+	struct loop_worker *cur_worker, *worker = NULL;
+	int css_id = 0;
+
+	if (cmd->blk_css)
+		css_id = cmd->blk_css->id;
+
+	spin_lock_irq(&lo->lo_lock);
+
+	if (!css_id) {
+		loop_queue_on_rootcg_locked(lo, cmd);
+		goto out;
+	}
+	node = &(lo->worker_tree.rb_node);
+
+	while (*node) {
+		parent = *node;
+		cur_worker = container_of(*node, struct loop_worker, rb_node);
+		if (cur_worker->css_id == css_id) {
+			worker = cur_worker;
+			break;
+		} else if (cur_worker->css_id < css_id) {
+			node = &((*node)->rb_left);
+		} else {
+			node = &((*node)->rb_right);
+		}
+	}
+	if (worker) {
+		list_add_tail(&cmd->list_entry, &worker->cmd_list);
+	} else {
+		worker = kmalloc(sizeof(struct loop_worker), GFP_NOIO);
+		/*
+		 * In the event we cannot allocate a worker, just
+		 * queue on the rootcg worker
+		 */
+		if (!worker) {
+			loop_queue_on_rootcg_locked(lo, cmd);
+			goto out;
+		}
+		worker->css_id = css_id;
+		INIT_WORK(&worker->work, loop_workfn);
+		INIT_LIST_HEAD(&worker->cmd_list);
+		worker->lo = lo;
+		rb_link_node(&worker->rb_node, parent, node);
+		rb_insert_color(&worker->rb_node, &lo->worker_tree);
+		list_add_tail(&cmd->list_entry, &worker->cmd_list);
+		queue_work(lo->workqueue, &worker->work);
+	}
+out:
+	spin_unlock_irq(&lo->lo_lock);
+}
+
 static void loop_update_rotational(struct loop_device *lo)
 {
 	struct file *file = lo->lo_backing_file;
@@ -993,6 +1056,8 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,

 	set_device_ro(bdev, (lo_flags & LO_FLAGS_READ_ONLY) != 0);

+	INIT_WORK(&lo->rootcg_work, loop_rootcg_workfn);
+	INIT_LIST_HEAD(&lo->rootcg_cmd_list);
 	lo->use_dio = false;
 	lo->lo_device = bdev;
 	lo->lo_flags = lo_flags;
@@ -1925,14 +1990,13 @@ static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx *hctx,
 	}

 	/* always use the first bio's css */
+	cmd->blk_css = NULL;
 #ifdef CONFIG_BLK_CGROUP
-	if (cmd->use_aio && rq->bio && rq->bio->bi_blkg) {
-		cmd->css = &bio_blkcg(rq->bio)->css;
-		css_get(cmd->css);
-	} else
+	if (rq->bio && rq->bio->bi_blkg)
+		cmd->blk_css = &bio_blkcg(rq->bio)->css;
 #endif
-		cmd->css = NULL;
-	kthread_queue_work(&lo->worker, &cmd->work);
+
+	loop_queue_work(lo, cmd);

 	return BLK_STS_OK;
 }
@@ -1942,6 +2006,9 @@ static void loop_handle_cmd(struct loop_cmd *cmd)
 	struct request *rq = blk_mq_rq_from_pdu(cmd);
 	const bool write = op_is_write(req_op(rq));
 	struct loop_device *lo = rq->q->queuedata;
+#ifdef CONFIG_MEMCG
+	struct cgroup_subsys_state *mem_css;
+#endif
 	int ret = 0;

 	if (write && (lo->lo_flags & LO_FLAGS_READ_ONLY)) {
@@ -1949,8 +2016,24 @@ static void loop_handle_cmd(struct loop_cmd *cmd)
 		ret = -EIO;
 		goto failed;
 	}
+	if (cmd->blk_css) {
+#ifdef CONFIG_MEMCG
+		mem_css = cgroup_get_e_css(cmd->blk_css->cgroup,
+					&memory_cgrp_subsys);
+		memalloc_use_memcg(mem_cgroup_from_css(mem_css));
+#endif
+		kthread_associate_blkcg(cmd->blk_css);
+	}
+
 	ret = do_req_filebacked(lo, rq);
- failed:
+
+	if (cmd->blk_css) {
+		kthread_associate_blkcg(NULL);
+#ifdef CONFIG_MEMCG
+		memalloc_unuse_memcg();
+#endif
+	}
+failed:
 	/* complete non-aio request */
 	if (!cmd->use_aio || ret) {
 		cmd->ret = ret ? -EIO : 0;
@@ -1958,26 +2041,50 @@
 	}
 }

-static void loop_queue_work(struct kthread_work *work)
+static void loop_process_work(struct loop_worker *worker,
+			struct list_head *cmd_list, struct loop_device *lo)
 {
-	struct loop_cmd *cmd =
-		container_of(work, struct loop_cmd, work);
+	int orig_flags = current->flags;
+	struct loop_cmd *cmd;
+
+	while (1) {
+		spin_lock_irq(&lo->lo_lock);
+		if (list_empty(cmd_list)) {
+			if (worker)
+				rb_erase(&worker->rb_node, &lo->worker_tree);
+			spin_unlock_irq(&lo->lo_lock);
+			kfree(worker);
+			current->flags = orig_flags;
+			return;
+		}

-	loop_handle_cmd(cmd);
+		cmd = container_of(
+			cmd_list->next, struct loop_cmd, list_entry);
+		list_del(cmd_list->next);
+		spin_unlock_irq(&lo->lo_lock);
+		current->flags |= PF_LESS_THROTTLE | PF_MEMALLOC_NOIO;
+		loop_handle_cmd(cmd);
+		current->flags = orig_flags;
+		cond_resched();
+	}
 }

-static int loop_init_request(struct blk_mq_tag_set *set, struct request *rq,
-		unsigned int hctx_idx, unsigned int numa_node)
+static void loop_workfn(struct work_struct *work)
 {
-	struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq);
+	struct loop_worker *worker =
+		container_of(work, struct loop_worker, work);
+	loop_process_work(worker, &worker->cmd_list, worker->lo);
+}

-	kthread_init_work(&cmd->work, loop_queue_work);
-	return 0;
+static void loop_rootcg_workfn(struct work_struct *work)
+{
+	struct loop_device *lo =
+		container_of(work, struct loop_device, rootcg_work);
+	loop_process_work(NULL, &lo->rootcg_cmd_list, lo);
 }

 static const struct blk_mq_ops loop_mq_ops = {
 	.queue_rq	= loop_queue_rq,
-	.init_request	= loop_init_request,
 	.complete	= lo_complete_rq,
 };

diff --git a/drivers/block/loop.h b/drivers/block/loop.h
index af75a5ee4094..239d4921b441 100644
--- a/drivers/block/loop.h
+++ b/drivers/block/loop.h
@@ -14,7 +14,6 @@
 #include <linux/blk-mq.h>
 #include <linux/spinlock.h>
 #include <linux/mutex.h>
-#include <linux/kthread.h>
 #include <uapi/linux/loop.h>

 /* Possible states of device */
@@ -54,8 +53,10 @@ struct loop_device {

 	spinlock_t		lo_lock;
 	int			lo_state;
-	struct kthread_worker	worker;
-	struct task_struct	*worker_task;
+	struct workqueue_struct *workqueue;
+	struct work_struct	rootcg_work;
+	struct list_head	rootcg_cmd_list;
+	struct rb_root		worker_tree;
 	bool			use_dio;
 	bool			sysfs_inited;

@@ -65,13 +66,13 @@ struct loop_device {
 };

 struct loop_cmd {
-	struct kthread_work work;
+	struct list_head list_entry;
 	bool use_aio; /* use AIO interface to handle I/O */
 	atomic_t ref; /* only for aio */
 	long ret;
 	struct kiocb iocb;
 	struct bio_vec *bvec;
-	struct cgroup_subsys_state *css;
+	struct cgroup_subsys_state *blk_css;
 };

 /* Support for loadable transfer modules */
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 735af8f15f95..dd8270c6191c 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -587,6 +587,7 @@ struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgrp,
 	rcu_read_unlock();
 	return css;
 }
+EXPORT_SYMBOL_GPL(cgroup_get_e_css);

 static void cgroup_get_live(struct cgroup *cgrp)
 {
-- 
2.17.1
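
For readers who want the queuing model without wading through the diff: below
is a small, self-contained userspace sketch (not kernel code, and not part of
the patch) of the lookup-or-create pattern the patch uses for its blkcg ->
struct loop_worker tree.  POSIX tsearch()/tfind() stand in for the kernel
rbtree, lo->lo_lock is omitted, and struct worker, get_worker(), worker_cmp()
and the css ids in main() are all made-up names for illustration only.

#include <search.h>
#include <stdio.h>
#include <stdlib.h>

/* Stand-in for struct loop_worker: one worker per cgroup css id. */
struct worker {
	int css_id;	/* key: cgroup css id (0 would mean rootcg) */
	int queued;	/* stand-in for the per-worker cmd_list */
};

static void *worker_tree;	/* tsearch() root, playing the role of lo->worker_tree */

static int worker_cmp(const void *a, const void *b)
{
	const struct worker *wa = a, *wb = b;

	return (wa->css_id > wb->css_id) - (wa->css_id < wb->css_id);
}

/* Find the worker for css_id, creating and inserting one if none exists. */
static struct worker *get_worker(int css_id)
{
	struct worker key = { .css_id = css_id };
	struct worker *w, **slot;

	slot = tfind(&key, &worker_tree, worker_cmp);
	if (slot)
		return *slot;	/* existing worker: just queue on it */

	w = calloc(1, sizeof(*w));
	if (!w)
		return NULL;	/* the patch falls back to the rootcg queue here */
	w->css_id = css_id;

	slot = tsearch(w, &worker_tree, worker_cmp);
	if (!slot) {
		free(w);
		return NULL;
	}
	return *slot;
}

int main(void)
{
	/* Three commands from two cgroups: first sight of a css id creates its worker. */
	int ids[] = { 7, 9, 7 };

	for (size_t i = 0; i < sizeof(ids) / sizeof(ids[0]); i++) {
		struct worker *w = get_worker(ids[i]);

		if (w)
			printf("css %d -> worker %p, %d cmd(s) queued\n",
			       ids[i], (void *)w, ++w->queued);
	}
	return 0;
}

The kernel version does the same walk open-coded over an rb_root under
lo->lo_lock, queues the command on the worker's cmd_list, and when kmalloc()
fails it queues on the rootcg list instead of failing the request.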