Naively charging loopback I/O to the original issuing blkcg can result
in priority inversion, because all loopback writes are funneled through
a single kworker thread.

This patch gives each loop device a workqueue and a tree mapping blkcg
css id -> struct loop_worker, where each loop_worker carries its own
queue of struct loop_cmd.  This allows individual blkcgs to make
forward progress while another is blocked due to resource control.
There is also a single rootcg loop_cmd queue for I/O not associated
with a blkcg (and as a fallback when a loop_worker cannot be
allocated).

The locking here is a bit heavy-handed - the spinlock is taken any
time the tree or one of the queues is touched.  However, the previous
implementation serialized everything through a single kworker, so I
don't think this is any worse.

This code previously took references on the cgroups (css_get()/
css_put()), but I don't believe that is necessary any more since the
bio itself pins the blkcg css until the I/O completes.

Signed-off-by: Dan Schatzberg <dschatzberg@xxxxxx>
---
 drivers/block/loop.c   | 179 ++++++++++++++++++++++++++++++++---------
 drivers/block/loop.h   |  11 +--
 kernel/cgroup/cgroup.c |   1 +
 3 files changed, 150 insertions(+), 41 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 739b372a5112..d0484eeecc9b 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -70,7 +70,6 @@
 #include <linux/writeback.h>
 #include <linux/completion.h>
 #include <linux/highmem.h>
-#include <linux/kthread.h>
 #include <linux/splice.h>
 #include <linux/sysfs.h>
 #include <linux/miscdevice.h>
@@ -78,6 +77,7 @@
 #include <linux/uio.h>
 #include <linux/ioprio.h>
 #include <linux/blk-cgroup.h>
+#include <linux/sched/mm.h>

 #include "loop.h"

@@ -503,8 +503,6 @@ static void lo_rw_aio_complete(struct kiocb *iocb, long ret, long ret2)
 {
 	struct loop_cmd *cmd = container_of(iocb, struct loop_cmd, iocb);

-	if (cmd->css)
-		css_put(cmd->css);
 	cmd->ret = ret;
 	lo_rw_aio_do_completion(cmd);
 }
@@ -565,8 +563,6 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
 	cmd->iocb.ki_complete = lo_rw_aio_complete;
 	cmd->iocb.ki_flags = IOCB_DIRECT;
 	cmd->iocb.ki_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0);
-	if (cmd->css)
-		kthread_associate_blkcg(cmd->css);

 	if (rw == WRITE)
 		ret = call_write_iter(file, &cmd->iocb, &iter);
@@ -574,7 +570,6 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
 		ret = call_read_iter(file, &cmd->iocb, &iter);

 	lo_rw_aio_do_completion(cmd);
-	kthread_associate_blkcg(NULL);

 	if (ret != -EIOCBQUEUED)
 		cmd->iocb.ki_complete(&cmd->iocb, ret, 0);
@@ -891,27 +886,95 @@ static void loop_config_discard(struct loop_device *lo)

 static void loop_unprepare_queue(struct loop_device *lo)
 {
-	kthread_flush_worker(&lo->worker);
-	kthread_stop(lo->worker_task);
-}
-
-static int loop_kthread_worker_fn(void *worker_ptr)
-{
-	current->flags |= PF_LESS_THROTTLE | PF_MEMALLOC_NOIO;
-	return kthread_worker_fn(worker_ptr);
+	flush_workqueue(lo->workqueue);
+	destroy_workqueue(lo->workqueue);
 }

 static int loop_prepare_queue(struct loop_device *lo)
 {
-	kthread_init_worker(&lo->worker);
-	lo->worker_task = kthread_run(loop_kthread_worker_fn,
-			&lo->worker, "loop%d", lo->lo_number);
-	if (IS_ERR(lo->worker_task))
+	lo->workqueue = alloc_workqueue("loop%d",
+					WQ_FREEZABLE | WQ_MEM_RECLAIM |
+					WQ_HIGHPRI, 0,
+					lo->lo_number);
+	if (!lo->workqueue)
 		return -ENOMEM;
-	set_user_nice(lo->worker_task, MIN_NICE);
+
 	return 0;
 }

+struct loop_worker {
+	struct rb_node rb_node;
+	struct work_struct work;
+	struct list_head cmd_list;
+	struct loop_device *lo;
+	int css_id;
+};
+
+static void loop_workfn(struct work_struct *work);
+static void loop_rootcg_workfn(struct work_struct *work);
+
+static void loop_queue_on_rootcg_locked(struct loop_device *lo,
+					struct loop_cmd *cmd)
+{
+	list_add_tail(&cmd->list_entry, &lo->rootcg_cmd_list);
+	if (list_is_first(&cmd->list_entry, &lo->rootcg_cmd_list))
+		queue_work(lo->workqueue, &lo->rootcg_work);
+}
+
+static void loop_queue_work(struct loop_device *lo, struct loop_cmd *cmd)
+{
+	struct rb_node **node = &(lo->worker_tree.rb_node), *parent = NULL;
+	struct loop_worker *cur_worker, *worker = NULL;
+	int css_id = 0;
+
+	if (cmd->blk_css)
+		css_id = cmd->blk_css->id;
+
+	spin_lock_irq(&lo->lo_lock);
+
+	if (!css_id) {
+		loop_queue_on_rootcg_locked(lo, cmd);
+		goto out;
+	}
+	node = &(lo->worker_tree.rb_node);
+
+	while (*node) {
+		parent = *node;
+		cur_worker = container_of(*node, struct loop_worker, rb_node);
+		if (cur_worker->css_id == css_id) {
+			worker = cur_worker;
+			break;
+		} else if (cur_worker->css_id < css_id) {
+			node = &((*node)->rb_left);
+		} else {
+			node = &((*node)->rb_right);
+		}
+	}
+	if (worker) {
+		list_add_tail(&cmd->list_entry, &worker->cmd_list);
+	} else {
+		worker = kmalloc(sizeof(struct loop_worker), GFP_NOIO);
+		/*
+		 * In the event we cannot allocate a worker, just
+		 * queue on the rootcg worker
+		 */
+		if (!worker) {
+			loop_queue_on_rootcg_locked(lo, cmd);
+			goto out;
+		}
+		worker->css_id = css_id;
+		INIT_WORK(&worker->work, loop_workfn);
+		INIT_LIST_HEAD(&worker->cmd_list);
+		worker->lo = lo;
+		rb_link_node(&worker->rb_node, parent, node);
+		rb_insert_color(&worker->rb_node, &lo->worker_tree);
+		list_add_tail(&cmd->list_entry, &worker->cmd_list);
+		queue_work(lo->workqueue, &worker->work);
+	}
+out:
+	spin_unlock_irq(&lo->lo_lock);
+}
+
 static void loop_update_rotational(struct loop_device *lo)
 {
 	struct file *file = lo->lo_backing_file;
@@ -993,6 +1056,8 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,

 	set_device_ro(bdev, (lo_flags & LO_FLAGS_READ_ONLY) != 0);

+	INIT_WORK(&lo->rootcg_work, loop_rootcg_workfn);
+	INIT_LIST_HEAD(&lo->rootcg_cmd_list);
 	lo->use_dio = false;
 	lo->lo_device = bdev;
 	lo->lo_flags = lo_flags;
@@ -1925,14 +1990,13 @@ static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx *hctx,
 	}

 	/* always use the first bio's css */
+	cmd->blk_css = NULL;
 #ifdef CONFIG_BLK_CGROUP
-	if (cmd->use_aio && rq->bio && rq->bio->bi_blkg) {
-		cmd->css = &bio_blkcg(rq->bio)->css;
-		css_get(cmd->css);
-	} else
+	if (rq->bio && rq->bio->bi_blkg)
+		cmd->blk_css = &bio_blkcg(rq->bio)->css;
 #endif
-		cmd->css = NULL;
-	kthread_queue_work(&lo->worker, &cmd->work);
+
+	loop_queue_work(lo, cmd);

 	return BLK_STS_OK;
 }
@@ -1942,6 +2006,9 @@ static void loop_handle_cmd(struct loop_cmd *cmd)
 	struct request *rq = blk_mq_rq_from_pdu(cmd);
 	const bool write = op_is_write(req_op(rq));
 	struct loop_device *lo = rq->q->queuedata;
+#ifdef CONFIG_MEMCG
+	struct cgroup_subsys_state *mem_css;
+#endif
 	int ret = 0;

 	if (write && (lo->lo_flags & LO_FLAGS_READ_ONLY)) {
@@ -1949,8 +2016,24 @@ static void loop_handle_cmd(struct loop_cmd *cmd)
 		ret = -EIO;
 		goto failed;
 	}
+	if (cmd->blk_css) {
+#ifdef CONFIG_MEMCG
+		mem_css = cgroup_get_e_css(cmd->blk_css->cgroup,
+					&memory_cgrp_subsys);
+		memalloc_use_memcg(mem_cgroup_from_css(mem_css));
+#endif
+		kthread_associate_blkcg(cmd->blk_css);
+	}
+
 	ret = do_req_filebacked(lo, rq);
- failed:
+
+	if (cmd->blk_css) {
+		kthread_associate_blkcg(NULL);
+#ifdef CONFIG_MEMCG
+		memalloc_unuse_memcg();
+#endif
+	}
+failed:
 	/* complete non-aio request */
 	if (!cmd->use_aio || ret) {
 		cmd->ret = ret ? -EIO : 0;
@@ -1958,26 +2041,50 @@
 	}
 }

-static void loop_queue_work(struct kthread_work *work)
+static void loop_process_work(struct loop_worker *worker,
+			struct list_head *cmd_list, struct loop_device *lo)
 {
-	struct loop_cmd *cmd =
-		container_of(work, struct loop_cmd, work);
+	int orig_flags = current->flags;
+	struct loop_cmd *cmd;
+
+	while (1) {
+		spin_lock_irq(&lo->lo_lock);
+		if (list_empty(cmd_list)) {
+			if (worker)
+				rb_erase(&worker->rb_node, &lo->worker_tree);
+			spin_unlock_irq(&lo->lo_lock);
+			kfree(worker);
+			current->flags = orig_flags;
+			return;
+		}

-	loop_handle_cmd(cmd);
+		cmd = container_of(
+			cmd_list->next, struct loop_cmd, list_entry);
+		list_del(cmd_list->next);
+		spin_unlock_irq(&lo->lo_lock);
+		current->flags |= PF_LESS_THROTTLE | PF_MEMALLOC_NOIO;
+		loop_handle_cmd(cmd);
+		current->flags = orig_flags;
+		cond_resched();
+	}
 }

-static int loop_init_request(struct blk_mq_tag_set *set, struct request *rq,
-		unsigned int hctx_idx, unsigned int numa_node)
+static void loop_workfn(struct work_struct *work)
 {
-	struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq);
+	struct loop_worker *worker =
+		container_of(work, struct loop_worker, work);
+	loop_process_work(worker, &worker->cmd_list, worker->lo);
+}

-	kthread_init_work(&cmd->work, loop_queue_work);
-	return 0;
+static void loop_rootcg_workfn(struct work_struct *work)
+{
+	struct loop_device *lo =
+		container_of(work, struct loop_device, rootcg_work);
+	loop_process_work(NULL, &lo->rootcg_cmd_list, lo);
 }

 static const struct blk_mq_ops loop_mq_ops = {
 	.queue_rq	= loop_queue_rq,
-	.init_request	= loop_init_request,
 	.complete	= lo_complete_rq,
 };

diff --git a/drivers/block/loop.h b/drivers/block/loop.h
index af75a5ee4094..239d4921b441 100644
--- a/drivers/block/loop.h
+++ b/drivers/block/loop.h
@@ -14,7 +14,6 @@
 #include <linux/blk-mq.h>
 #include <linux/spinlock.h>
 #include <linux/mutex.h>
-#include <linux/kthread.h>
 #include <uapi/linux/loop.h>

 /* Possible states of device */
@@ -54,8 +53,10 @@ struct loop_device {

 	spinlock_t		lo_lock;
 	int			lo_state;
-	struct kthread_worker	worker;
-	struct task_struct	*worker_task;
+	struct workqueue_struct *workqueue;
+	struct work_struct	rootcg_work;
+	struct list_head	rootcg_cmd_list;
+	struct rb_root		worker_tree;
 	bool			use_dio;
 	bool			sysfs_inited;

@@ -65,13 +66,13 @@ struct loop_device {
 };

 struct loop_cmd {
-	struct kthread_work work;
+	struct list_head list_entry;
 	bool use_aio; /* use AIO interface to handle I/O */
 	atomic_t ref; /* only for aio */
 	long ret;
 	struct kiocb iocb;
 	struct bio_vec *bvec;
-	struct cgroup_subsys_state *css;
+	struct cgroup_subsys_state *blk_css;
 };

 /* Support for loadable transfer modules */
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 735af8f15f95..dd8270c6191c 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -587,6 +587,7 @@ struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgrp,
 	rcu_read_unlock();
 	return css;
 }
+EXPORT_SYMBOL_GPL(cgroup_get_e_css);

 static void cgroup_get_live(struct cgroup *cgrp)
 {
-- 
2.17.1
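
For readers who want the queuing model without wading through the diff: below
is a small, self-contained userspace sketch (not kernel code, and not part of
the patch) of the lookup-or-create pattern the patch uses for its blkcg ->
struct loop_worker tree.  POSIX tsearch()/tfind() stand in for the kernel
rbtree, lo->lo_lock is omitted, and struct worker, get_worker(), worker_cmp()
and the css ids in main() are all made-up names for illustration only.

#include <search.h>
#include <stdio.h>
#include <stdlib.h>

/* Stand-in for struct loop_worker: one worker per cgroup css id. */
struct worker {
	int css_id;	/* key: cgroup css id (0 would mean rootcg) */
	int queued;	/* stand-in for the per-worker cmd_list */
};

static void *worker_tree;	/* tsearch() root, playing the role of lo->worker_tree */

static int worker_cmp(const void *a, const void *b)
{
	const struct worker *wa = a, *wb = b;

	return (wa->css_id > wb->css_id) - (wa->css_id < wb->css_id);
}

/* Find the worker for css_id, creating and inserting one if none exists. */
static struct worker *get_worker(int css_id)
{
	struct worker key = { .css_id = css_id };
	struct worker *w, **slot;

	slot = tfind(&key, &worker_tree, worker_cmp);
	if (slot)
		return *slot;	/* existing worker: just queue on it */

	w = calloc(1, sizeof(*w));
	if (!w)
		return NULL;	/* the patch falls back to the rootcg queue here */
	w->css_id = css_id;

	slot = tsearch(w, &worker_tree, worker_cmp);
	if (!slot) {
		free(w);
		return NULL;
	}
	return *slot;
}

int main(void)
{
	/* Three commands from two cgroups: first sight of a css id creates its worker. */
	int ids[] = { 7, 9, 7 };

	for (size_t i = 0; i < sizeof(ids) / sizeof(ids[0]); i++) {
		struct worker *w = get_worker(ids[i]);

		if (w)
			printf("css %d -> worker %p, %d cmd(s) queued\n",
			       ids[i], (void *)w, ++w->queued);
	}
	return 0;
}

The kernel version does the same walk open-coded over an rb_root under
lo->lo_lock, queues the command on the worker's cmd_list, and when kmalloc()
fails it queues on the rootcg list instead of failing the request.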