This commit adds a new test for the kernel ULP DDP (Direct Data Placement) feature with NVMe-TCP. Configuration of DDP is per NIC and is done through a script in the kernel source. For this reason we add 2 new config vars: - KERNELSRC: path to the running kernel sources - NVME_IFACE: name of the network interface to configure the offload on Signed-off-by: Aurelien Aptel <aaptel@xxxxxxxxxx> Signed-off-by: Shai Malin <smalin@xxxxxxxxxx> Reviewed-by: Daniel Wagner <dwagner@xxxxxxx> --- Documentation/running-tests.md | 9 ++ README.md | 1 + common/rc | 8 + tests/nvme/055 | 285 +++++++++++++++++++++++++++++++++ tests/nvme/055.out | 44 +++++ tests/nvme/rc | 8 + 6 files changed, 355 insertions(+) create mode 100755 tests/nvme/055 create mode 100644 tests/nvme/055.out diff --git a/Documentation/running-tests.md b/Documentation/running-tests.md index fe4f729..a42fc91 100644 --- a/Documentation/running-tests.md +++ b/Documentation/running-tests.md @@ -124,6 +124,15 @@ The NVMe tests can be additionally parameterized via environment variables. be skipped and this script gets called. This makes it possible to run the fabric nvme tests against a real target. +#### NVMe-TCP zero-copy offload + +The NVMe-TCP ZC offload tests use a couple more variables. + +- KERNELSRC: Path to running kernel sources. + Needed for the script to configure the offload. +- NVME_IFACE: Name of the interface the offload should be enabled on. + This should be the same interface the NVMe connection is made with. + ### Running nvme-rdma and SRP tests These tests will use the siw (soft-iWARP) driver by default. 
The rdma_rxe diff --git a/README.md b/README.md index 55227d9..5073510 100644 --- a/README.md +++ b/README.md @@ -30,6 +30,7 @@ Some tests require the following: - nbd-client and nbd-server (Debian) or nbd (Fedora, openSUSE, Arch Linux) - dmsetup (Debian) or device-mapper (Fedora, openSUSE, Arch Linux) - rublk (`cargo install --version=^0.1 rublk`) for ublk test +- python3, ethtool, iproute2 for nvme-tcp zero-copy offload test Build blktests with `make`. Optionally, install it to a known location with `make install` (`/usr/local/blktests` by default, but this can be changed by diff --git a/common/rc b/common/rc index b2e68b2..0c8b51f 100644 --- a/common/rc +++ b/common/rc @@ -148,6 +148,14 @@ _have_loop() { _have_driver loop && _have_program losetup } +_have_kernel_source() { + if [ -z "${KERNELSRC}" ]; then + SKIP_REASONS+=("KERNELSRC not set") + return 1 + fi + return 0 +} + _have_blktrace() { # CONFIG_BLK_DEV_IO_TRACE might still be disabled, but this is easier # to check. We can fix it if someone complains. diff --git a/tests/nvme/055 b/tests/nvme/055 new file mode 100755 index 0000000..7e76126 --- /dev/null +++ b/tests/nvme/055 @@ -0,0 +1,285 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-3.0+ +# Copyright (C) 2024 Aurelien Aptel <aaptel@xxxxxxxxxx> +# +# zero-copy offload + +. 
tests/nvme/rc + +DESCRIPTION="enable zero copy offload and run rw traffic" +TIMED=1 + +iface_idx="" + +# these vars get updated after each call to connect_run_disconnect() +nb_packets=0 +nb_bytes=0 +nb_offload_packets=0 +nb_offload_bytes=0 +offload_bytes_ratio=0 +offload_packets_ratio=0 + +requires() { + _nvme_requires + _require_remote_nvme_target + _require_nvme_trtype tcp + _have_kernel_option ULP_DDP + # require nvme-tcp as a module to be able to change the ddp_offload param + _have_module nvme_tcp && _have_module_param nvme_tcp ddp_offload + _have_fio + _have_program ip + _have_program ethtool + _have_kernel_source && have_netlink_cli && _have_program python3 + have_iface +} + +have_netlink_cli() { + local cli + cli="${KERNELSRC}/tools/net/ynl/cli.py" + + if ! [ -f "$cli" ]; then + SKIP_REASONS+=("Kernel sources do not have tools/net/ynl/cli.py") + return 1 + fi + + if ! "$cli" -h &> /dev/null; then + SKIP_REASONS+=("Cannot run the kernel tools/net/ynl/cli.py") + return 1; + fi + + if ! 
[ -f "${KERNELSRC}/Documentation/netlink/specs/ulp_ddp.yaml" ]; then + SKIP_REASONS+=("Kernel sources do not have the ULP DDP netlink specs") + return 1 + fi +} + +have_iface() { + if [ -z "${NVME_IFACE}" ]; then + SKIP_REASONS+=("NVME_IFACE not set") + return 1 + fi + return 0 +} + +set_conditions() { + _set_nvme_trtype "$@" +} + +netlink_cli() { + "${KERNELSRC}/tools/net/ynl/cli.py" \ + --spec "${KERNELSRC}/Documentation/netlink/specs/ulp_ddp.yaml" \ + "$@" +} + +eth_stat() { + ethtool -S "${NVME_IFACE}" | awk "/ $1:/ { print \$2 }" +} + +ddp_stat() { + netlink_cli --do stats-get --json "{\"ifindex\": $iface_idx}" \ + | awk -F: "/'$1'/{print \$2;}" | tr -d '{},' +} + +ddp_caps() { + local out + out="$(netlink_cli --do caps-get --json "{\"ifindex\": $iface_idx}")" + echo "$out" | tr '{},' '\n' | tr -d ' '| awk -F: "/$1/ { print \$2 }" +} + +configure_ddp() { + local mod_param + local cap + + mod_param=$1 + cap=$2 + + echo "=== configured with ddp_offload=$mod_param and caps=$cap ===" + + # set ddp_offload module param + modprobe -q -r nvme-tcp + modprobe -q nvme-tcp ddp_offload=$mod_param + + # set capabilities + netlink_cli --do caps-set --json "{\"ifindex\": $iface_idx, \"wanted\": $cap, \"wanted_mask\": 3}" >> "$FULL" 2>&1 +} + +connect_run_disconnect() { + local io_size + local nvme_dev + local nb_drop + local drop_ratio + local nb_resync + local resync_ratio + + # offload stat counters + local start_sk_add + local start_sk_add_fail + local start_sk_del + local start_setup + local start_setup_fail + local start_teardown + local start_off_bytes + local start_eth_bytes + local start_off_packets + local start_eth_packets + local end_sk_add + local end_sk_add_fail + local end_sk_del + local end_setup + local end_setup_fail + local end_teardown + local end_drop + local end_resync + local end_off_bytes + local end_eth_bytes + local end_off_packets + local end_eth_packets + + io_size=$1 + + start_sk_add=$(ddp_stat rx-nvme-tcp-sk-add) + start_sk_add_fail=$(ddp_stat 
rx-nvme-tcp-sk-add-fail) + start_sk_del=$(ddp_stat rx-nvme-tcp-sk-del) + start_setup=$(ddp_stat rx-nvme-tcp-setup) + start_setup_fail=$(ddp_stat rx-nvme-tcp-setup-fail) + start_teardown=$(ddp_stat rx-nvme-tcp-teardown) + start_drop=$(ddp_stat rx-nvme-tcp-drop) + start_resync=$(ddp_stat rx-nvme-tcp-resync) + start_off_packets=$(ddp_stat rx-nvme-tcp-packets) + start_off_bytes=$(ddp_stat rx-nvme-tcp-bytes) + start_eth_packets=$(eth_stat rx_packets) + start_eth_bytes=$(eth_stat rx_bytes) + _nvme_connect_subsys --hdr-digest --data-digest --nr-io-queues 8 + + nvme_dev="/dev/$(_find_nvme_ns "${def_subsys_uuid}")" + + local common_args=( + --blocksize_range=$io_size + --rw=randrw + --numjobs=8 + --iodepth=128 + --name=randrw + --ioengine=libaio + --time_based + --runtime="$TIMEOUT" + --direct=1 + --invalidate=1 + --randrepeat=1 + --norandommap + --filename="$nvme_dev" + ) + + echo "IO size: $io_size" + + _run_fio "${common_args[@]}" + _nvme_disconnect_subsys >> "$FULL" 2>&1 + + end_sk_add=$(ddp_stat rx-nvme-tcp-sk-add) + end_sk_add_fail=$(ddp_stat rx-nvme-tcp-sk-add-fail) + end_sk_del=$(ddp_stat rx-nvme-tcp-sk-del) + end_setup=$(ddp_stat rx-nvme-tcp-setup) + end_setup_fail=$(ddp_stat rx-nvme-tcp-setup-fail) + end_teardown=$(ddp_stat rx-nvme-tcp-teardown) + end_drop=$(ddp_stat rx-nvme-tcp-drop) + end_resync=$(ddp_stat rx-nvme-tcp-resync) + end_off_packets=$(ddp_stat rx-nvme-tcp-packets) + end_eth_packets=$(eth_stat rx_packets) + end_off_bytes=$(ddp_stat rx-nvme-tcp-bytes) + end_eth_bytes=$(eth_stat rx_bytes) + + echo "Offloaded sockets: $((end_sk_add - start_sk_add))" + echo "Failed sockets: $((end_sk_add_fail - start_sk_add_fail))" + echo "Unoffloaded sockets: $((end_sk_del - start_sk_del))" + echo "Offload packet leaked: $((end_setup - end_teardown))" + echo "Failed packet setup: $((end_setup_fail - start_setup_fail))" + + # global var results + nb_drop=$(( end_drop - start_drop )) + nb_resync=$(( end_resync - start_resync )) + nb_packets=$(( end_eth_packets - 
start_eth_packets )) + nb_offload_packets=$(( end_off_packets - start_off_packets )) + nb_bytes=$(( end_eth_bytes - start_eth_bytes )) + nb_offload_bytes=$(( end_off_bytes - start_off_bytes )) + + offload_packets_ratio=0 + offload_bytes_ratio=0 + + # sanity check and avoid div by zero in ratio calculation + if [[ nb_bytes -eq 0 || nb_packets -eq 0 ]]; then + echo "No traffic: $nb_bytes bytes, $nb_packets packets" + return + fi + + offload_packets_ratio=$(( nb_offload_packets*100/nb_packets )) + offload_bytes_ratio=$(( nb_offload_bytes*100/nb_bytes )) + + drop_ratio=$(( nb_drop*100/nb_packets )) + resync_ratio=$(( nb_resync*100/nb_packets )) + [[ drop_ratio -gt 5 ]] && echo "High drop ratio: $drop_ratio %" + [[ resync_ratio -gt 5 ]] && echo "High resync ratio: $resync_ratio %" +} + +test() { + local starting_ddp_config + + : "${TIMEOUT:=30}" + + echo "Running ${TEST_NAME}" + + # get iface index + iface_idx=$(ip address | awk -F: "/${NVME_IFACE}/ { print \$1; exit; }") + + # check hw supports ddp + if [[ $(( $(ddp_caps hw) & 3)) -ne 3 ]]; then + SKIP_REASONS+=("${NVME_IFACE} does not support nvme-tcp ddp offload") + return + fi + + _setup_nvmet + _nvmet_target_setup + + if [ "$(cat "/sys/module/nvme_tcp/parameters/ddp_offload")" = Y ]; then + starting_ddp_config="1 $(ddp_caps active)" + else + starting_ddp_config="0 $(ddp_caps active)" + fi + + # if any of the offload knobs are disabled, no offload should occur + # and offloaded packets & bytes should be zero + + configure_ddp 0 0 + connect_run_disconnect 32k-1M + echo "Offloaded packets: $nb_offload_packets" + echo "Offloaded bytes: $nb_offload_bytes" + + configure_ddp 0 3 + connect_run_disconnect 32k-1M + echo "Offloaded packets: $nb_offload_packets" + echo "Offloaded bytes: $nb_offload_bytes" + + configure_ddp 1 0 + connect_run_disconnect 32k-1M + echo "Offloaded packets: $nb_offload_packets" + echo "Offloaded bytes: $nb_offload_bytes" + + # if everything is enabled, the offload should happen for large IOs only + 
configure_ddp 1 3 + + connect_run_disconnect 32k-1M + [[ nb_offload_packets -lt 100 ]] && echo "Low offloaded packets: $nb_offload_packets" + [[ nb_offload_bytes -lt 32768 ]] && echo "Low offloaded bytes: $nb_offload_bytes" + [[ offload_bytes_ratio -lt 90 ]] && echo "Low offloaded bytes ratio: $offload_bytes_ratio %" + [[ offload_packets_ratio -lt 95 ]] && echo "Low offloaded packets ratio: $offload_packets_ratio %" + + # small IO should be under the offload threshold, ratio should be zero + connect_run_disconnect 4k-16k + echo "Offload bytes ratio: $offload_bytes_ratio %" + echo "Offload packets ratio: $offload_packets_ratio %" + + _nvmet_target_cleanup + + # restore starting config + configure_ddp $starting_ddp_config > /dev/null + + echo "Test complete" +} diff --git a/tests/nvme/055.out b/tests/nvme/055.out new file mode 100644 index 0000000..06706a6 --- /dev/null +++ b/tests/nvme/055.out @@ -0,0 +1,44 @@ +Running nvme/055 +=== configured with ddp_offload=0 and caps=0 === +IO size: 32k-1M +Offloaded sockets: 0 +Failed sockets: 0 +Unoffloaded sockets: 0 +Offload packet leaked: 0 +Failed packet setup: 0 +Offloaded packets: 0 +Offloaded bytes: 0 +=== configured with ddp_offload=0 and caps=3 === +IO size: 32k-1M +Offloaded sockets: 0 +Failed sockets: 0 +Unoffloaded sockets: 0 +Offload packet leaked: 0 +Failed packet setup: 0 +Offloaded packets: 0 +Offloaded bytes: 0 +=== configured with ddp_offload=1 and caps=0 === +IO size: 32k-1M +Offloaded sockets: 0 +Failed sockets: 0 +Unoffloaded sockets: 0 +Offload packet leaked: 0 +Failed packet setup: 0 +Offloaded packets: 0 +Offloaded bytes: 0 +=== configured with ddp_offload=1 and caps=3 === +IO size: 32k-1M +Offloaded sockets: 8 +Failed sockets: 0 +Unoffloaded sockets: 8 +Offload packet leaked: 0 +Failed packet setup: 0 +IO size: 4k-16k +Offloaded sockets: 8 +Failed sockets: 0 +Unoffloaded sockets: 8 +Offload packet leaked: 0 +Failed packet setup: 0 +Offload bytes ratio: 0 % +Offload packets ratio: 0 % +Test complete 
diff --git a/tests/nvme/rc b/tests/nvme/rc index d1a4c01..4a43e43 100644 --- a/tests/nvme/rc +++ b/tests/nvme/rc @@ -199,6 +199,14 @@ _require_kernel_nvme_target() { return 0 } +_require_remote_nvme_target() { + if [ -z "${nvme_target_control}" ]; then + SKIP_REASONS+=("Remote target required but NVME_TARGET_CONTROL is not set") + return 1 + fi + return 0 +} + _test_dev_nvme_ctrl() { echo "/dev/char/$(cat "${TEST_DEV_SYSFS}/device/dev")" } -- 2.34.1