[PATCH 10/10] blkcg: implement BPF_PROG_TYPE_IO_COST

Currently, blkcg implements one builtin IO cost model - linear.  To
allow customization and experimentation, allow a bpf program to
override the IO cost model.
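
For illustration, a minimal cost program that charges a flat amount per
IO plus a per-page component might look like the sketch below (the
program name and coefficients are arbitrary and not part of the
selftests added by this patch); it only uses the bpf_io_cost context
fields introduced here:

  // SPDX-License-Identifier: GPL-2.0
  /* Sketch only: flat per-IO cost model with arbitrary coefficients. */
  #include <linux/bpf.h>
  #include "bpf_helpers.h"

  SEC("io_cost")
  int flat_cost(struct bpf_io_cost *ctx)
  {
  	const __u64 base_cost = 100000;	/* arbitrary charge per IO */
  	const __u64 page_cost = 10000;	/* arbitrary charge per 4k page (8 sectors) */

  	ctx->cost += base_cost;
  	if (!ctx->is_merge)
  		ctx->cost += page_cost * (ctx->nr_sectors >> 3);
  	return 0;
  }

Such a program can be attached to (or, with a prog fd of -1, detached
from) a device through the BLKBPFIOCOST ioctl; see iocost_ctrl.c below.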

Signed-off-by: Tejun Heo <tj@xxxxxxxxxx>
---
 block/Kconfig                                 |   3 +
 block/blk-ioweight.c                          | 148 +++++++++++++++++-
 block/blk.h                                   |   8 +
 block/ioctl.c                                 |   4 +
 include/linux/bpf_types.h                     |   3 +
 include/uapi/linux/bpf.h                      |  11 ++
 include/uapi/linux/fs.h                       |   2 +
 tools/bpf/bpftool/feature.c                   |   3 +
 tools/bpf/bpftool/main.h                      |   1 +
 tools/include/uapi/linux/bpf.h                |  11 ++
 tools/include/uapi/linux/fs.h                 |   2 +
 tools/lib/bpf/libbpf.c                        |   2 +
 tools/lib/bpf/libbpf_probes.c                 |   1 +
 tools/testing/selftests/bpf/Makefile          |   2 +-
 tools/testing/selftests/bpf/iocost_ctrl.c     |  43 +++++
 .../selftests/bpf/progs/iocost_linear_prog.c  |  52 ++++++
 16 files changed, 287 insertions(+), 9 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/iocost_ctrl.c
 create mode 100644 tools/testing/selftests/bpf/progs/iocost_linear_prog.c

diff --git a/block/Kconfig b/block/Kconfig
index 15b3de28a264..2882fdd573ca 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -204,4 +204,7 @@ config BLK_MQ_RDMA
 config BLK_PM
 	def_bool BLOCK && PM
 
+config BLK_BPF_IO_COST
+	def_bool BLK_CGROUP_IOWEIGHT && BPF_SYSCALL
+
 source "block/Kconfig.iosched"
diff --git a/block/blk-ioweight.c b/block/blk-ioweight.c
index 3d9fc1a631be..de4fc57bb77c 100644
--- a/block/blk-ioweight.c
+++ b/block/blk-ioweight.c
@@ -43,6 +43,10 @@
  * parameters can be configured from userspace via
  * /sys/block/DEV/queue/io_cost_model.
  *
+ * For experimentation and refinement, the IO cost model can also be
+ * replaced by an IO_COST bpf program.  See progs/iocost_linear_prog.c
+ * and iocost_ctrl.c under tools/testing/selftests/bpf for how to use it.
+ *
  * 2. Control Strategy
  *
  * The device virtual time (vtime) is used as the primary control metric.
@@ -176,6 +180,7 @@
 #include <linux/parser.h>
 #include <linux/sched/signal.h>
 #include <linux/blk-cgroup.h>
+#include <linux/filter.h>
 #include "blk-rq-qos.h"
 #include "blk-stat.h"
 #include "blk-wbt.h"
@@ -387,6 +392,10 @@ struct iow {
 	bool				enabled;
 
 	struct iow_params		params;
+#ifdef CONFIG_BPF_SYSCALL
+	/* if non-NULL, bpf cost model is being used */
+	struct bpf_prog __rcu		*cost_prog;
+#endif
 	u32				period_us;
 	u32				margin_us;
 	u64				vrate_min;
@@ -1565,6 +1574,45 @@ static void iow_timer_fn(struct timer_list *timer)
 	spin_unlock_irq(&iow->lock);
 }
 
+#ifdef CONFIG_BLK_BPF_IO_COST
+static bool calc_vtime_cost_bpf(struct bio *bio, struct iow_gq *iowg,
+				bool is_merge, u64 *costp)
+{
+	struct iow *iow = iowg->iow;
+	struct bpf_prog *prog;
+	bool ret = false;
+
+	if (!iow->cost_prog)
+		return ret;
+
+	rcu_read_lock();
+	prog = rcu_dereference(iow->cost_prog);
+	if (prog) {
+		struct bpf_io_cost ctx = {
+			.cost = 0,
+			.opf = bio->bi_opf,
+			.nr_sectors = bio_sectors(bio),
+			.sector = bio->bi_iter.bi_sector,
+			.last_sector = iowg->cursor,
+			.is_merge = is_merge,
+		};
+
+		BPF_PROG_RUN(prog, &ctx);
+		*costp = ctx.cost;
+		ret = true;
+	}
+	rcu_read_unlock();
+
+	return ret;
+}
+#else
+static bool calc_vtime_cost_bpf(struct bio *bio, struct iow_gq *iowg,
+				bool is_merge, u64 *costp)
+{
+	return false;
+}
+#endif
+
 static void calc_vtime_cost_builtin(struct bio *bio, struct iow_gq *iowg,
 				    bool is_merge, u64 *costp)
 {
@@ -1610,6 +1658,9 @@ static u64 calc_vtime_cost(struct bio *bio, struct iow_gq *iowg, bool is_merge)
 {
 	u64 cost;
 
+	if (calc_vtime_cost_bpf(bio, iowg, is_merge, &cost))
+		return cost;
+
 	calc_vtime_cost_builtin(bio, iowg, is_merge, &cost);
 	return cost;
 }
@@ -2214,14 +2265,17 @@ static u64 iow_cost_model_prfill(struct seq_file *sf,
 	if (!dname)
 		return 0;
 
-	seq_printf(sf, "%s ctrl=%s model=linear "
-		   "rbps=%llu rseqiops=%llu rrandiops=%llu "
-		   "wbps=%llu wseqiops=%llu wrandiops=%llu\n",
-		   dname, iow->user_cost_model ? "user" : "auto",
-		   u[I_LCOEF_RBPS],
-		   u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
-		   u[I_LCOEF_WBPS],
-		   u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS]);
+	if (iow->cost_prog)
+		seq_printf(sf, "%s ctrl=bpf\n", dname);
+	else
+		seq_printf(sf, "%s ctrl=%s model=linear "
+			   "rbps=%llu rseqiops=%llu rrandiops=%llu "
+			   "wbps=%llu wseqiops=%llu wrandiops=%llu\n",
+			   dname, iow->user_cost_model ? "user" : "auto",
+			   u[I_LCOEF_RBPS],
+			   u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
+			   u[I_LCOEF_WBPS],
+			   u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS]);
 	return 0;
 }
 
@@ -2363,6 +2417,84 @@ static struct blkcg_policy blkcg_policy_iow = {
 	.pd_free_fn	= iow_pd_free,
 };
 
+#ifdef CONFIG_BLK_BPF_IO_COST
+static bool io_cost_is_valid_access(int off, int size,
+				    enum bpf_access_type type,
+				    const struct bpf_prog *prog,
+				    struct bpf_insn_access_aux *info)
+{
+	if (off < 0 || off >= sizeof(struct bpf_io_cost) || off % size)
+		return false;
+
+	if (off != offsetof(struct bpf_io_cost, cost) && type != BPF_READ)
+		return false;
+
+	switch (off) {
+	case bpf_ctx_range(struct bpf_io_cost, opf):
+		bpf_ctx_record_field_size(info, sizeof(__u32));
+		return bpf_ctx_narrow_access_ok(off, size, sizeof(__u32));
+	case offsetof(struct bpf_io_cost, nr_sectors):
+		return size == sizeof(__u32);
+	case offsetof(struct bpf_io_cost, cost):
+	case offsetof(struct bpf_io_cost, sector):
+	case offsetof(struct bpf_io_cost, last_sector):
+		return size == sizeof(__u64);
+	case offsetof(struct bpf_io_cost, is_merge):
+		return size == sizeof(__u8);
+	}
+
+	return false;
+}
+
+const struct bpf_prog_ops io_cost_prog_ops = {
+};
+
+const struct bpf_verifier_ops io_cost_verifier_ops = {
+	.is_valid_access	= io_cost_is_valid_access,
+};
+
+int blk_bpf_io_cost_ioctl(struct block_device *bdev, unsigned cmd,
+			  char __user *arg)
+{
+	int prog_fd = (int)(long)arg;
+	struct bpf_prog *prog = NULL;
+	struct request_queue *q;
+	struct iow *iow;
+	int ret = 0;
+
+	q = bdev_get_queue(bdev);
+	if (!q)
+		return -ENXIO;
+	iow = q_to_iow(q);
+
+	if (prog_fd >= 0) {
+		prog = bpf_prog_get_type(prog_fd, BPF_PROG_TYPE_IO_COST);
+		if (IS_ERR(prog))
+			return PTR_ERR(prog);
+
+		spin_lock_irq(&iow->lock);
+		if (!iow->cost_prog) {
+			rcu_assign_pointer(iow->cost_prog, prog);
+			prog = NULL;
+		} else {
+			ret = -EEXIST;
+		}
+		spin_unlock_irq(&iow->lock);
+	} else {
+		spin_lock_irq(&iow->lock);
+		if (iow->cost_prog) {
+			prog = iow->cost_prog;
+			rcu_assign_pointer(iow->cost_prog, NULL);
+		}
+		spin_unlock_irq(&iow->lock);
+	}
+
+	if (prog)
+		bpf_prog_put(prog);
+	return ret;
+}
+#endif /* CONFIG_BLK_BPF_IO_COST */
+
 static int __init iow_init(void)
 {
 	return blkcg_policy_register(&blkcg_policy_iow);
diff --git a/block/blk.h b/block/blk.h
index 7814aa207153..98fa2283534f 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -317,6 +317,14 @@ static inline void blk_queue_bounce(struct request_queue *q, struct bio **bio)
 }
 #endif /* CONFIG_BOUNCE */
 
+#ifdef CONFIG_BLK_BPF_IO_COST
+int blk_bpf_io_cost_ioctl(struct block_device *bdev, unsigned cmd,
+			  char __user *arg);
+#else
+static inline int blk_bpf_io_cost_ioctl(struct block_device *bdev, unsigned cmd,
+					char __user *arg) { return -ENOTTY; }
+#endif
+
 #ifdef CONFIG_BLK_CGROUP_IOLATENCY
 extern int blk_iolatency_init(struct request_queue *q);
 #else
diff --git a/block/ioctl.c b/block/ioctl.c
index 15a0eb80ada9..89d48d7dea0f 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -11,6 +11,8 @@
 #include <linux/pr.h>
 #include <linux/uaccess.h>
 
+#include "blk.h"
+
 static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user *arg)
 {
 	struct block_device *bdevp;
@@ -590,6 +592,8 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
 	case BLKTRACESETUP:
 	case BLKTRACETEARDOWN:
 		return blk_trace_ioctl(bdev, cmd, argp);
+	case BLKBPFIOCOST:
+		return blk_bpf_io_cost_ioctl(bdev, cmd, argp);
 	case IOC_PR_REGISTER:
 		return blkdev_pr_register(bdev, argp);
 	case IOC_PR_RESERVE:
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index 5a9975678d6f..fb0a91c655c2 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -37,6 +37,9 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_LIRC_MODE2, lirc_mode2)
 #ifdef CONFIG_INET
 BPF_PROG_TYPE(BPF_PROG_TYPE_SK_REUSEPORT, sk_reuseport)
 #endif
+#ifdef CONFIG_BLK_BPF_IO_COST
+BPF_PROG_TYPE(BPF_PROG_TYPE_IO_COST, io_cost)
+#endif
 
 BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 63e0cf66f01a..1664ef4ccc79 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -170,6 +170,7 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_FLOW_DISSECTOR,
 	BPF_PROG_TYPE_CGROUP_SYSCTL,
 	BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE,
+	BPF_PROG_TYPE_IO_COST,
 };
 
 enum bpf_attach_type {
@@ -3472,6 +3473,16 @@ struct bpf_flow_keys {
 	};
 };
 
+struct bpf_io_cost {
+	__u64	cost;				/* output */
+
+	__u32	opf;
+	__u32	nr_sectors;
+	__u64	sector;
+	__u64	last_sector;
+	__u8	is_merge;
+};
+
 struct bpf_func_info {
 	__u32	insn_off;
 	__u32	type_id;
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index 59c71fa8c553..ddf3c80c9407 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -181,6 +181,8 @@ struct fsxattr {
 #define BLKSECDISCARD _IO(0x12,125)
 #define BLKROTATIONAL _IO(0x12,126)
 #define BLKZEROOUT _IO(0x12,127)
+#define BLKBPFIOCOST _IO(0x12, 128)
+
 /*
  * A jump here: 130-131 are reserved for zoned block devices
  * (see uapi/linux/blkzoned.h)
diff --git a/tools/bpf/bpftool/feature.c b/tools/bpf/bpftool/feature.c
index d672d9086fff..beeac8ac48f3 100644
--- a/tools/bpf/bpftool/feature.c
+++ b/tools/bpf/bpftool/feature.c
@@ -383,6 +383,9 @@ static void probe_kernel_image_config(void)
 		/* bpftilter module with "user mode helper" */
 		"CONFIG_BPFILTER_UMH",
 
+		/* Block */
+		"CONFIG_BLK_BPF_IO_COST",
+
 		/* test_bpf module for BPF tests */
 		"CONFIG_TEST_BPF",
 	};
diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h
index 3d63feb7f852..298e53f35573 100644
--- a/tools/bpf/bpftool/main.h
+++ b/tools/bpf/bpftool/main.h
@@ -74,6 +74,7 @@ static const char * const prog_type_name[] = {
 	[BPF_PROG_TYPE_SK_REUSEPORT]		= "sk_reuseport",
 	[BPF_PROG_TYPE_FLOW_DISSECTOR]		= "flow_dissector",
 	[BPF_PROG_TYPE_CGROUP_SYSCTL]		= "cgroup_sysctl",
+	[BPF_PROG_TYPE_IO_COST]			= "io_cost",
 };
 
 extern const char * const map_type_name[];
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 63e0cf66f01a..1664ef4ccc79 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -170,6 +170,7 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_FLOW_DISSECTOR,
 	BPF_PROG_TYPE_CGROUP_SYSCTL,
 	BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE,
+	BPF_PROG_TYPE_IO_COST,
 };
 
 enum bpf_attach_type {
@@ -3472,6 +3473,16 @@ struct bpf_flow_keys {
 	};
 };
 
+struct bpf_io_cost {
+	__u64	cost;				/* output */
+
+	__u32	opf;
+	__u32	nr_sectors;
+	__u64	sector;
+	__u64	last_sector;
+	__u8	is_merge;
+};
+
 struct bpf_func_info {
 	__u32	insn_off;
 	__u32	type_id;
diff --git a/tools/include/uapi/linux/fs.h b/tools/include/uapi/linux/fs.h
index 59c71fa8c553..ddf3c80c9407 100644
--- a/tools/include/uapi/linux/fs.h
+++ b/tools/include/uapi/linux/fs.h
@@ -181,6 +181,8 @@ struct fsxattr {
 #define BLKSECDISCARD _IO(0x12,125)
 #define BLKROTATIONAL _IO(0x12,126)
 #define BLKZEROOUT _IO(0x12,127)
+#define BLKBPFIOCOST _IO(0x12, 128)
+
 /*
  * A jump here: 130-131 are reserved for zoned block devices
  * (see uapi/linux/blkzoned.h)
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 197b574406b3..6dbee409f3b0 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -2266,6 +2266,7 @@ static bool bpf_prog_type__needs_kver(enum bpf_prog_type type)
 	case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE:
 	case BPF_PROG_TYPE_PERF_EVENT:
 	case BPF_PROG_TYPE_CGROUP_SYSCTL:
+	case BPF_PROG_TYPE_IO_COST:
 		return false;
 	case BPF_PROG_TYPE_KPROBE:
 	default:
@@ -3168,6 +3169,7 @@ static const struct {
 	BPF_PROG_SEC("lwt_out",			BPF_PROG_TYPE_LWT_OUT),
 	BPF_PROG_SEC("lwt_xmit",		BPF_PROG_TYPE_LWT_XMIT),
 	BPF_PROG_SEC("lwt_seg6local",		BPF_PROG_TYPE_LWT_SEG6LOCAL),
+	BPF_PROG_SEC("io_cost",			BPF_PROG_TYPE_IO_COST),
 	BPF_APROG_SEC("cgroup_skb/ingress",	BPF_PROG_TYPE_CGROUP_SKB,
 						BPF_CGROUP_INET_INGRESS),
 	BPF_APROG_SEC("cgroup_skb/egress",	BPF_PROG_TYPE_CGROUP_SKB,
diff --git a/tools/lib/bpf/libbpf_probes.c b/tools/lib/bpf/libbpf_probes.c
index 5e2aa83f637a..024831756151 100644
--- a/tools/lib/bpf/libbpf_probes.c
+++ b/tools/lib/bpf/libbpf_probes.c
@@ -101,6 +101,7 @@ probe_load(enum bpf_prog_type prog_type, const struct bpf_insn *insns,
 	case BPF_PROG_TYPE_SK_REUSEPORT:
 	case BPF_PROG_TYPE_FLOW_DISSECTOR:
 	case BPF_PROG_TYPE_CGROUP_SYSCTL:
+	case BPF_PROG_TYPE_IO_COST:
 	default:
 		break;
 	}
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 66f2dca1dee1..c28f308c9575 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -23,7 +23,7 @@ TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test
 	test_align test_verifier_log test_dev_cgroup test_tcpbpf_user \
 	test_sock test_btf test_sockmap test_lirc_mode2_user get_cgroup_id_user \
 	test_socket_cookie test_cgroup_storage test_select_reuseport test_section_names \
-	test_netcnt test_tcpnotify_user test_sock_fields test_sysctl
+	test_netcnt test_tcpnotify_user test_sock_fields test_sysctl iocost_ctrl
 
 BPF_OBJ_FILES = $(patsubst %.c,%.o, $(notdir $(wildcard progs/*.c)))
 TEST_GEN_FILES = $(BPF_OBJ_FILES)
diff --git a/tools/testing/selftests/bpf/iocost_ctrl.c b/tools/testing/selftests/bpf/iocost_ctrl.c
new file mode 100644
index 000000000000..d9d3eb70d0ac
--- /dev/null
+++ b/tools/testing/selftests/bpf/iocost_ctrl.c
@@ -0,0 +1,43 @@
+#include <stdio.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <fcntl.h>
+
+#include <linux/bpf.h>
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+
+#include <linux/fs.h>
+
+int main(int argc, char **argv)
+{
+	struct bpf_object *obj;
+	int dev_fd, prog_fd = -1;
+
+	if (argc < 2) {
+		fprintf(stderr, "Usage: iocost_ctrl BLKDEV [BPF_PROG]\n");
+		return 1;
+	}
+
+	dev_fd = open(argv[1], O_RDONLY);
+	if (dev_fd < 0) {
+		perror("open(BLKDEV)");
+		return 1;
+	}
+
+	if (argc > 2) {
+		if (bpf_prog_load(argv[2], BPF_PROG_TYPE_IO_COST,
+				  &obj, &prog_fd)) {
+			perror("bpf_prog_load(BPF_PROG)");
+			return 1;
+		}
+	}
+
+	if (ioctl(dev_fd, BLKBPFIOCOST, (long)prog_fd)) {
+		perror("ioctl(BLKBPFIOCOST)");
+		return 1;
+	}
+	return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/iocost_linear_prog.c b/tools/testing/selftests/bpf/progs/iocost_linear_prog.c
new file mode 100644
index 000000000000..4e202c595658
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/iocost_linear_prog.c
@@ -0,0 +1,52 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/version.h>
+#include <linux/bpf.h>
+#include "bpf_helpers.h"
+
+#define REQ_OP_READ	0
+#define REQ_OP_WRITE	1
+#define REQ_OP_BITS	8
+#define REQ_OP_MASK	((1 << REQ_OP_BITS) - 1)
+
+#define LCOEF_RSEQIO	14663889
+#define LCOEF_RRANDIO	248752010
+#define LCOEF_RPAGE	28151808
+#define LCOEF_WSEQIO	32671670
+#define LCOEF_WRANDIO	63150006
+#define LCOEF_WPAGE	7323648
+
+#define RAND_IO_CUTOFF	10
+
+SEC("io_cost")
+int func(struct bpf_io_cost *ctx)
+{
+	int op;
+	__u64 seqio, randio, page;
+	__s64 delta;
+
+	switch (ctx->opf & REQ_OP_MASK) {
+	case REQ_OP_READ:
+		seqio = LCOEF_RSEQIO;
+		randio = LCOEF_RRANDIO;
+		page = LCOEF_RPAGE;
+		break;
+	case REQ_OP_WRITE:
+		seqio = LCOEF_WSEQIO;
+		randio = LCOEF_WRANDIO;
+		page = LCOEF_WPAGE;
+		break;
+	default:
+		return 0;
+	}
+
+	delta = ctx->sector - ctx->last_sector;
+	if (delta >= -RAND_IO_CUTOFF && delta <= RAND_IO_CUTOFF)
+		ctx->cost += seqio;
+	else
+		ctx->cost += randio;
+	if (!ctx->is_merge)
+		ctx->cost += page * (ctx->nr_sectors >> 3);
+
+	return 0;
+}
-- 
2.17.1




