Add a ublk-stripe target for covering both the bpf aio and the io split
features.

The bpf program stripes I/O over the backing files in power-of-two chunks
(chunk_shift >= 12): each ublk io command is split at chunk boundaries, one
bpf aio is submitted per chunk to the backing file selected by the chunk
index, and a per-io reference count kept in a BPF hash map completes the
ublk io command once every backing aio has finished. The stripe setting
(chunk_shift and the backing file descriptors) is passed from userspace
through the pinned 'stripe_map'.

Add two tests: test_stripe_01 runs fio write + verify over the striped
device, and test_stripe_02 covers mkfs/mount/umount.

Signed-off-by: Ming Lei <tom.leiming@xxxxxxxxx>
---
 tools/testing/selftests/ublk/Makefile         |   3 +
 .../selftests/ublk/progs/ublk_stripe.c        | 319 ++++++++++++++++++
 .../testing/selftests/ublk/test_stripe_01.sh  |  35 ++
 .../testing/selftests/ublk/test_stripe_02.sh  |  26 ++
 tools/testing/selftests/ublk/ublk_bpf.c       |  88 ++++-
 5 files changed, 468 insertions(+), 3 deletions(-)
 create mode 100644 tools/testing/selftests/ublk/progs/ublk_stripe.c
 create mode 100755 tools/testing/selftests/ublk/test_stripe_01.sh
 create mode 100755 tools/testing/selftests/ublk/test_stripe_02.sh

diff --git a/tools/testing/selftests/ublk/Makefile b/tools/testing/selftests/ublk/Makefile
index 2540ae7a75a3..7c30c5728694 100644
--- a/tools/testing/selftests/ublk/Makefile
+++ b/tools/testing/selftests/ublk/Makefile
@@ -27,6 +27,9 @@ TEST_PROGS += test_null_04.sh
 TEST_PROGS += test_loop_01.sh
 TEST_PROGS += test_loop_02.sh
 
+TEST_PROGS += test_stripe_01.sh
+TEST_PROGS += test_stripe_02.sh
+
 # Order correspond to 'make run_tests' order
 TEST_GEN_PROGS_EXTENDED = ublk_bpf
 
diff --git a/tools/testing/selftests/ublk/progs/ublk_stripe.c b/tools/testing/selftests/ublk/progs/ublk_stripe.c
new file mode 100644
index 000000000000..98a59239047c
--- /dev/null
+++ b/tools/testing/selftests/ublk/progs/ublk_stripe.c
@@ -0,0 +1,319 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "vmlinux.h"
+#include <linux/const.h>
+#include <linux/errno.h>
+#include <linux/falloc.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+//#define DEBUG
+#include "ublk_bpf.h"
+
+/* libbpf v1.4.5 is required for struct_ops to work */
+
+struct ublk_stripe {
+#define MAX_BACKFILES 4
+        unsigned char chunk_shift;
+        unsigned char nr_backfiles;
+        int fds[MAX_BACKFILES];
+};
+
+struct {
+        __uint(type, BPF_MAP_TYPE_HASH);
+        __uint(max_entries, 128);
+        __type(key, unsigned int);              /* dev id */
+        __type(value, struct ublk_stripe);      /* stripe setting */
+} stripe_map SEC(".maps");
+
+/* todo: make it writable payload of ublk_bpf_io */
+struct ublk_io_payload {
+        unsigned int ref;
+        int res;
+};
+
+struct {
+        __uint(type, BPF_MAP_TYPE_HASH);
+        __uint(max_entries, 10240);
+        __type(key, unsigned long long);        /* dev_id + q_id + tag */
+        __type(value, struct ublk_io_payload);  /* io payload */
+} io_map SEC(".maps");
+
+static inline void dec_stripe_io_ref(const struct ublk_bpf_io *io, struct ublk_io_payload *pv, int ret)
+{
+        if (!pv)
+                return;
+
+        if (pv->res >= 0)
+                pv->res = ret;
+
+        if (!__sync_sub_and_fetch(&pv->ref, 1)) {
+                unsigned rw = (io->iod->op_flags & 0xff);
+
+                if (pv->res >= 0 && (rw <= 1))
+                        pv->res = io->iod->nr_sectors << 9;
+                ublk_bpf_complete_io(io, pv->res);
+        }
+}
+
+static inline void ublk_stripe_comp_and_release_aio(struct bpf_aio *aio, int ret)
+{
+        struct ublk_bpf_io *io = ublk_bpf_acquire_io_from_aio(aio);
+        struct ublk_io_payload *pv = NULL;
+        unsigned long long io_key;
+
+        if (!io)
+                return;
+
+        io_key = build_io_key(io);
+        pv = bpf_map_lookup_elem(&io_map, &io_key);
+
+        /* drop the reference taken for this underlying aio */
+        dec_stripe_io_ref(io, pv, ret);
+        ublk_bpf_release_io_from_aio(io);
+
+        ublk_bpf_dettach_and_complete_aio(aio);
+        bpf_aio_release(aio);
+}
+
+SEC("struct_ops/bpf_aio_complete_cb")
+void BPF_PROG(ublk_stripe_comp_cb, struct bpf_aio *aio, long ret)
+{
+        BPF_DBG("aio result %ld, back_file %s pos %llx", ret,
+                        aio->iocb.ki_filp->f_path.dentry->d_name.name,
+                        aio->iocb.ki_pos);
+        ublk_stripe_comp_and_release_aio(aio, ret);
+}
+
+SEC(".struct_ops.link")
+struct bpf_aio_complete_ops stripe_ublk_bpf_aio_ops = {
+        .id = 32,
+        .bpf_aio_complete_cb = (void *)ublk_stripe_comp_cb,
+};
+
+static inline int ublk_stripe_submit_backing_io(const struct ublk_bpf_io *io,
+                int backfile_fd, unsigned long backfile_off,
+                unsigned int backfile_bytes,
+                unsigned int buf_off)
+{
+        const struct ublksrv_io_desc *iod = io->iod;
+        unsigned int op_flags = 0;
+        struct bpf_aio *aio;
+        int res = -EINVAL;
+        int op;
+
+        /* translate the ublk opcode into the backing file's aio opcode */
+        switch (iod->op_flags & 0xff) {
+        case 0 /*UBLK_IO_OP_READ*/:
+                op = BPF_AIO_OP_FS_READ;
+                break;
+        case 1 /*UBLK_IO_OP_WRITE*/:
+                op = BPF_AIO_OP_FS_WRITE;
+                break;
+        case 2 /*UBLK_IO_OP_FLUSH*/:
+                op = BPF_AIO_OP_FS_FSYNC;
+                break;
+        case 3 /*UBLK_IO_OP_DISCARD*/:
+                op = BPF_AIO_OP_FS_FALLOCATE;
+                op_flags = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE;
+                break;
+        case 4 /*UBLK_IO_OP_WRITE_SAME*/:
+                op = BPF_AIO_OP_FS_FALLOCATE;
+                op_flags = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE;
+                break;
+        case 5 /*UBLK_IO_OP_WRITE_ZEROES*/:
+                op = BPF_AIO_OP_FS_FALLOCATE;
+                op_flags = FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE;
+                break;
+        default:
+                return -EINVAL;
+        }
+
+        res = -ENOMEM;
+        aio = bpf_aio_alloc(op, 0);
+        if (!aio)
+                goto fail;
+
+        /* attach aio into the specified range of this io command */
+        res = ublk_bpf_attach_and_prep_aio(io, buf_off, backfile_bytes, aio);
+        if (res < 0) {
+                bpf_printk("bpf aio attaching failed %d\n", res);
+                goto fail;
+        }
+
+        /* submit this aio onto the backing file */
+        res = bpf_aio_submit(aio, backfile_fd, backfile_off, backfile_bytes, op_flags);
+        if (res < 0) {
+                bpf_printk("aio submit failed %d\n", res);
+                ublk_stripe_comp_and_release_aio(aio, res);
+        }
+        return 0;
+fail:
+        return res;
+}
+
+static int calculate_backfile_off_bytes(const struct ublk_stripe *stripe,
+                unsigned long stripe_off, unsigned int stripe_bytes,
+                unsigned long *backfile_off,
+                unsigned int *backfile_bytes)
+{
+        unsigned long chunk_size = 1U << stripe->chunk_shift;
+        unsigned int nr_bf = stripe->nr_backfiles;
+        unsigned long unit_chunk_size = nr_bf << stripe->chunk_shift;
+        unsigned long start_off = stripe_off & ~(chunk_size - 1);
+        unsigned long unit_start_off = stripe_off & ~(unit_chunk_size - 1);
+        unsigned int idx = (start_off - unit_start_off) >> stripe->chunk_shift;
+
+        *backfile_bytes = stripe_bytes;
+        *backfile_off = (unit_start_off / nr_bf) + (idx << stripe->chunk_shift) + (stripe_off - start_off);
+
+        return stripe->fds[idx % MAX_BACKFILES];
+}
+
+static unsigned int calculate_stripe_off_bytes(const struct ublk_stripe *stripe,
+                const struct ublksrv_io_desc *iod, unsigned int this_off,
+                unsigned long *stripe_off)
+{
+        unsigned long off, next_off;
+        unsigned long chunk_size = 1UL << stripe->chunk_shift;
+        unsigned int max_size = (iod->nr_sectors << 9) - this_off;
+
+        off = (iod->start_sector << 9) + this_off;
+        next_off = (off & ~(chunk_size - 1)) + chunk_size;
+
+        *stripe_off = off;
+
+        if (max_size < next_off - off)
+                return max_size;
+        return next_off - off;
+}
+
+static inline ublk_bpf_return_t __ublk_stripe_handle_io_cmd(const struct ublk_bpf_io *io, unsigned int off)
+{
+        ublk_bpf_return_t ret = ublk_bpf_return_val(UBLK_BPF_IO_QUEUED, 0);
+        unsigned long stripe_off, backfile_off;
+        unsigned int stripe_bytes, backfile_bytes;
+        int dev_id = ublk_bpf_get_dev_id(io);
+        const struct ublksrv_io_desc *iod;
+        const struct ublk_stripe *stripe;
+        int res = -EINVAL;
+        int backfile_fd;
+        unsigned long long io_key = build_io_key(io);
+        struct ublk_io_payload pl = {
+                .ref = 2,
+                .res = 0,
+        };
+        struct ublk_io_payload *pv = NULL;
+
+        iod = ublk_bpf_get_iod(io);
+        if (!iod) {
+                ublk_bpf_complete_io(io, res);
+                return ret;
+        }
+
+        BPF_DBG("ublk dev %u qid %u: handle io cmd tag %u op %u %llx-%u off %u",
+                        ublk_bpf_get_dev_id(io),
+                        ublk_bpf_get_queue_id(io),
+                        ublk_bpf_get_io_tag(io),
+                        iod->op_flags & 0xff,
+                        iod->start_sector << 9,
+                        iod->nr_sectors << 9, off);
+
+        /* retrieve the stripe setting of this device */
+        stripe = bpf_map_lookup_elem(&stripe_map, &dev_id);
+        if (!stripe) {
+                bpf_printk("can't get stripe setting for dev %d\n", dev_id);
+                return ret;
+        }
+
+        /* todo: build as big a chunk as possible for each underlying file/disk */
+        stripe_bytes = calculate_stripe_off_bytes(stripe, iod, off, &stripe_off);
+        backfile_fd = calculate_backfile_off_bytes(stripe, stripe_off, stripe_bytes,
+                        &backfile_off, &backfile_bytes);
+        BPF_DBG("\t <chunk_shift %u files %u> stripe(%lx %u) backfile(%d %lx %u)",
+                        stripe->chunk_shift, stripe->nr_backfiles,
+                        stripe_off, stripe_bytes,
+                        backfile_fd, backfile_off, backfile_bytes);
+
+        if (!stripe_bytes) {
+                bpf_printk("zero-length stripe chunk, off %u\n", off);
+                res = -EINVAL;
+                goto exit;
+        }
+
+        /* grab one submission reference, and one extra for the whole batch */
+        if (!off) {
+                res = bpf_map_update_elem(&io_map, &io_key, &pl, BPF_ANY);
+                if (res) {
+                        bpf_printk("update io map element failed %d key %llx\n", res, io_key);
+                        goto exit;
+                }
+        } else {
+                pv = bpf_map_lookup_elem(&io_map, &io_key);
+                if (pv)
+                        __sync_fetch_and_add(&pv->ref, 1);
+        }
+
+        /* handle this io command by submitting IOs on the backing file */
+        res = ublk_stripe_submit_backing_io(io, backfile_fd, backfile_off, backfile_bytes, off);
+
+exit:
+        /* the io cmd can't be completed until the batch reference is dropped */
+        if (res < 0) {
+                bpf_printk("submit bpf aio failed %d\n", res);
+                ublk_bpf_complete_io(io, res);
+                return ret;
+        }
+
+        /* drop the extra reference for the whole batch */
+        if (off + stripe_bytes == (iod->nr_sectors << 9)) {
+                if (!pv)
+                        pv = bpf_map_lookup_elem(&io_map, &io_key);
+                dec_stripe_io_ref(io, pv, pv ? pv->res : 0);
+        }
+
+        return ublk_bpf_return_val(UBLK_BPF_IO_CONTINUE, stripe_bytes);
+}
+
+SEC("struct_ops/ublk_bpf_release_io_cmd")
+void BPF_PROG(ublk_stripe_release_io_cmd, struct ublk_bpf_io *io)
+{
+        BPF_DBG("%s: complete io command %d", __func__, io->res);
+}
+
+SEC("struct_ops.s/ublk_bpf_queue_io_cmd_daemon")
+ublk_bpf_return_t BPF_PROG(ublk_stripe_handle_io_cmd, struct ublk_bpf_io *io, unsigned int off)
+{
+        return __ublk_stripe_handle_io_cmd(io, off);
+}
+
+SEC("struct_ops/ublk_bpf_attach_dev")
+int BPF_PROG(ublk_stripe_attach_dev, int dev_id)
+{
+        const struct ublk_stripe *stripe;
+
+        /* retrieve the stripe setting of this device */
+        stripe = bpf_map_lookup_elem(&stripe_map, &dev_id);
+        if (!stripe) {
+                bpf_printk("can't get stripe setting for dev %d\n", dev_id);
+                return -EINVAL;
+        }
+
+        if (stripe->nr_backfiles > MAX_BACKFILES)
+                return -EINVAL;
+
+        if (stripe->chunk_shift < 12)
+                return -EINVAL;
+
+        return 0;
+}
+
+SEC(".struct_ops.link")
+struct ublk_bpf_ops stripe_ublk_bpf_ops = {
+        .id = 32,
+        .attach_dev = (void *)ublk_stripe_attach_dev,
+        .queue_io_cmd_daemon = (void *)ublk_stripe_handle_io_cmd,
+        .release_io_cmd = (void *)ublk_stripe_release_io_cmd,
+};
+
+char LICENSE[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/ublk/test_stripe_01.sh b/tools/testing/selftests/ublk/test_stripe_01.sh
new file mode 100755
index 000000000000..3c21f7db495a
--- /dev/null
+++ b/tools/testing/selftests/ublk/test_stripe_01.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+
+. test_common.sh
+
+TID="stripe_01"
+ERR_CODE=0
+
+# prepare & register and pin bpf prog
+_prep_bpf_test "stripe" ublk_stripe.bpf.o
+
+backfile_0=`_create_backfile 256M`
+backfile_1=`_create_backfile 256M`
+
+# add one ublk stripe disk over the two backing files, with the pinned bpf prog
+_add_ublk_dev -t stripe -n 0 --bpf_prog 32 --bpf_aio_prog 32 --quiet $backfile_0 $backfile_1
+
+# run fio over the ublk disk
+fio --name=write_and_verify \
+        --filename=/dev/ublkb0 \
+        --ioengine=libaio --iodepth=4 \
+        --rw=write \
+        --size=256M \
+        --direct=1 \
+        --verify=crc32c \
+        --do_verify=1 \
+        --bs=4k > /dev/null 2>&1
+ERR_CODE=$?
+
+# cleanup & unregister and unpin the bpf prog
+_cleanup_bpf_test "stripe"
+
+_remove_backfile $backfile_0
+_remove_backfile $backfile_1
+
+_show_result $TID $ERR_CODE
diff --git a/tools/testing/selftests/ublk/test_stripe_02.sh b/tools/testing/selftests/ublk/test_stripe_02.sh
new file mode 100755
index 000000000000..fdbb81dc53d8
--- /dev/null
+++ b/tools/testing/selftests/ublk/test_stripe_02.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+
+. test_common.sh
+
+TID="stripe_02"
+ERR_CODE=0
+
+# prepare & register and pin bpf prog
+_prep_bpf_test "stripe" ublk_stripe.bpf.o
+
+backfile_0=`_create_backfile 256M`
+backfile_1=`_create_backfile 256M`
+
+# add one ublk stripe disk over the two backing files, with the pinned bpf prog
+_add_ublk_dev -t stripe -n 0 --bpf_prog 32 --bpf_aio_prog 32 --quiet $backfile_0 $backfile_1
+
+_mkfs_mount_test /dev/ublkb0
+ERR_CODE=$?
+
+# cleanup & unregister and unpin the bpf prog
+_cleanup_bpf_test "stripe"
+
+_remove_backfile $backfile_0
+_remove_backfile $backfile_1
+
+_show_result $TID $ERR_CODE
diff --git a/tools/testing/selftests/ublk/ublk_bpf.c b/tools/testing/selftests/ublk/ublk_bpf.c
index c24d5e18a1b1..85b2b4a09e05 100644
--- a/tools/testing/selftests/ublk/ublk_bpf.c
+++ b/tools/testing/selftests/ublk/ublk_bpf.c
@@ -1283,14 +1283,14 @@ static int cmd_dev_reg_bpf(struct dev_ctx *ctx)
 
 static int cmd_dev_help(char *exe)
 {
-        printf("%s add -t [null|loop] [-q nr_queues] [-d depth] [-n dev_id] [--bpf_prog ublk_prog_id] [--bpf_aio_prog ublk_aio_prog_id] [backfile1] [backfile2] ...\n", exe);
+        printf("%s add -t [null|loop|stripe] [-q nr_queues] [-d depth] [-n dev_id] [--bpf_prog ublk_prog_id] [--bpf_aio_prog ublk_aio_prog_id] [backfile1] [backfile2] ...\n", exe);
         printf("\t default: nr_queues=2(max 4), depth=128(max 128), dev_id=-1(auto allocation)\n");
         printf("%s del [-n dev_id] -a \n", exe);
         printf("\t -a delete all devices -n delete specified device\n");
         printf("%s list [-n dev_id] -a \n", exe);
         printf("\t -a list all devices, -n list specified device, default -a \n");
-        printf("%s reg -t [null|loop] bpf_prog_obj_path \n", exe);
-        printf("%s unreg -t [null|loop]\n", exe);
+        printf("%s reg -t [null|loop|stripe] bpf_prog_obj_path \n", exe);
+        printf("%s unreg -t [null|loop|stripe]\n", exe);
         return 0;
 }
 
@@ -1475,6 +1475,83 @@ static int ublk_loop_tgt_init(struct ublk_dev *dev)
         return 0;
 }
 
+struct ublk_stripe_params {
+        unsigned char chunk_shift;
+        unsigned char nr_backfiles;
+        int fds[MAX_BACK_FILES];
+};
+
+static int stripe_bpf_setup_parameters(struct ublk_dev *dev, unsigned int chunk_shift)
+{
+        int dev_id = dev->dev_info.dev_id;
+        struct ublk_stripe_params stripe = {
+                .chunk_shift = chunk_shift,
+                .nr_backfiles = dev->nr_fds - 1,
+        };
+        int map_fd;
+        int err, i;
+
+        for (i = 0; i < stripe.nr_backfiles; i++)
+                stripe.fds[i] = dev->fds[i + 1];
+
+        map_fd = bpf_obj_get("/sys/fs/bpf/ublk/stripe/stripe_map");
+        if (map_fd < 0) {
+                ublk_err("Error getting map file descriptor\n");
+                return -EINVAL;
+        }
+
+        err = bpf_map_update_elem(map_fd, &dev_id, &stripe, BPF_ANY);
+        if (err) {
+                ublk_err("Error updating map element: %d\n", errno);
+                return -EINVAL;
+        }
+
+        return 0;
+}
+
+static int ublk_stripe_tgt_init(struct ublk_dev *dev)
+{
+        unsigned long long bytes = 0;
+        unsigned chunk_shift = 12;
+        int ret, i;
+        struct ublk_params p = {
+                .types = UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_BPF,
+                .basic = {
+                        .logical_bs_shift = 9,
+                        .physical_bs_shift = 12,
+                        .io_opt_shift = 12,
+                        .io_min_shift = 9,
+                        .max_sectors = dev->dev_info.max_io_buf_bytes >> 9,
+                },
+                .bpf = {
+                        .flags = UBLK_BPF_HAS_OPS_ID | UBLK_BPF_HAS_AIO_OPS_ID,
+                        .ops_id = dev->bpf_prog_id,
+                        .aio_ops_id = dev->bpf_aio_prog_id,
+                },
+        };
+
+        ret = backing_file_tgt_init(dev);
+        if (ret)
+                return ret;
+
+        assert(stripe_bpf_setup_parameters(dev, chunk_shift) == 0);
+
+        for (i = 0; i < dev->nr_fds - 1; i++) {
+                unsigned long size = dev->tgt.backing_file_size[i];
+
+                if (size != dev->tgt.backing_file_size[0])
+                        return -EINVAL;
+                if (size & ((1 << chunk_shift) - 1))
+                        return -EINVAL;
+                bytes += size;
+        }
+
+        dev->tgt.dev_size = bytes;
+        p.basic.dev_sectors = bytes >> 9;
+        dev->tgt.params = p;
+
+        return 0;
+}
 static const struct ublk_tgt_ops tgt_ops_list[] = {
         {
@@ -1487,6 +1564,11 @@ static const struct ublk_tgt_ops tgt_ops_list[] = {
                 .init_tgt = ublk_loop_tgt_init,
                 .deinit_tgt = backing_file_tgt_deinit,
         },
+        {
+                .name = "stripe",
+                .init_tgt = ublk_stripe_tgt_init,
+                .deinit_tgt = backing_file_tgt_deinit,
+        },
 };
 
 static const struct ublk_tgt_ops *ublk_find_tgt(const char *name)
-- 
2.47.0
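
A note for reviewers: to make the io split behaviour easier to follow, below
is a minimal standalone user-space sketch of how one ublk io command gets cut
at chunk boundaries and spread over the backing files. It mirrors the
splitting done by calculate_stripe_off_bytes() and the per-chunk file
selection (fds[idx]) in ublk_stripe.c, but it is only an illustration: the
file name stripe_split_demo.c, the helper names and the CHUNK_SHIFT /
NR_BACKFILES values are made up for the example and are not part of the
patch.

/* stripe_split_demo.c: illustrate how one ublk io is split at chunk boundaries */
#include <stdio.h>

#define CHUNK_SHIFT     12      /* 4KB chunks, the minimum ublk_stripe_attach_dev() accepts */
#define NR_BACKFILES    2       /* the two 256M backing files used by test_stripe_01.sh */

/* bytes from 'off' up to the next chunk boundary, capped by what is left */
static unsigned int split_bytes(unsigned long off, unsigned int left)
{
        unsigned long chunk_size = 1UL << CHUNK_SHIFT;
        unsigned long next_off = (off & ~(chunk_size - 1)) + chunk_size;

        return (left < next_off - off) ? left : (unsigned int)(next_off - off);
}

/*
 * backing file index for this offset: the chunk index inside the stripe
 * unit; like the BPF program, the mask arithmetic assumes the number of
 * backing files is a power of two
 */
static unsigned int backfile_idx(unsigned long off)
{
        unsigned long chunk_size = 1UL << CHUNK_SHIFT;
        unsigned long unit_size = (unsigned long)NR_BACKFILES << CHUNK_SHIFT;
        unsigned long chunk_start = off & ~(chunk_size - 1);
        unsigned long unit_start = off & ~(unit_size - 1);

        return (chunk_start - unit_start) >> CHUNK_SHIFT;
}

int main(void)
{
        /* a 16KB io starting 2KB before a chunk boundary, as fio may issue */
        unsigned long off = 6 << 10;
        unsigned int left = 16 << 10;

        while (left) {
                unsigned int bytes = split_bytes(off, left);

                printf("lba off %#lx len %5u -> backing file %u\n",
                       off, bytes, backfile_idx(off));
                off += bytes;
                left -= bytes;
        }
        return 0;
}

With this input the sketch prints a 2KB head, three full 4KB chunks and a 2KB
tail, alternating between backing files 1 and 0, which matches how the bpf
program is expected to consume the per-chunk byte counts it returns via
UBLK_BPF_IO_CONTINUE.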