On 03/06/2024 10:24, Hannes Reinecke wrote:
> On 5/30/24 16:24, Ofir Gal wrote:
>> skb_splice_from_iter() warns on !sendpage_ok(), which results in nvme-tcp
>> data transfer failure. This warning leads to hanging IO.
>>
>> nvme-tcp uses sendpage_ok() to check only the first page of an iterator
>> and decides whether to disable MSG_SPLICE_PAGES based on that single
>> check. The iterator can represent a list of contiguous pages.
>>
>> When MSG_SPLICE_PAGES is enabled, skb_splice_from_iter() is used; it
>> requires all pages in the iterator to be sendable and checks each page
>> with sendpage_ok().
>>
>> nvme_tcp_try_send_data() might allow MSG_SPLICE_PAGES when the first
>> page is sendable but the following ones are not. skb_splice_from_iter()
>> will attempt to send all the pages in the iterator. When an unsendable
>> page is reached, the IO hangs.
>>
>> This series introduces a helper, sendpages_ok(), which returns true if
>> all the contiguous pages are sendable.
>>
>> Drivers that want to send contiguous pages with MSG_SPLICE_PAGES may use
>> this helper to check whether the page list is OK. If the helper does not
>> return true, the driver should clear the MSG_SPLICE_PAGES flag.
>>
>>
>> The bug is reproducible; reproduction requires nvme-over-tcp
>> controllers with an optimal IO size bigger than PAGE_SIZE. Creating a
>> raid with a bitmap over those devices reproduces the bug.
>>
>> To simulate a large optimal IO size you can use dm-stripe with a
>> single device.
>> A script that reproduces the issue on top of brd devices using
>> dm-stripe is attached below.
>>
>>
>> I added 3 debug prints to test my theory: one in nvme_tcp_try_send_data()
>> and two in skb_splice_from_iter(), the first before the sendpage_ok()
>> check and the second on !sendpage_ok(), after the warning.
>> ...
>> nvme_tcp: sendpage_ok, page: 0x654eccd7 (pfn: 120755), len: 262144, offset: 0
>> skbuff: before sendpage_ok - i: 0. page: 0x654eccd7 (pfn: 120755)
>> skbuff: before sendpage_ok - i: 1. page: 0x1666a4da (pfn: 120756)
>> skbuff: before sendpage_ok - i: 2. page: 0x54f9f140 (pfn: 120757)
>> WARNING: at net/core/skbuff.c:6848 skb_splice_from_iter+0x142/0x450
>> skbuff: !sendpage_ok - page: 0x54f9f140 (pfn: 120757). is_slab: 1, page_count: 1
>> ...
>>
>>
>> stack trace:
>> ...
>> WARNING: at net/core/skbuff.c:6848 skb_splice_from_iter+0x141/0x450
>> Workqueue: nvme_tcp_wq nvme_tcp_io_work
>> Call Trace:
>>  ? show_regs+0x6a/0x80
>>  ? skb_splice_from_iter+0x141/0x450
>>  ? __warn+0x8d/0x130
>>  ? skb_splice_from_iter+0x141/0x450
>>  ? report_bug+0x18c/0x1a0
>>  ? handle_bug+0x40/0x70
>>  ? exc_invalid_op+0x19/0x70
>>  ? asm_exc_invalid_op+0x1b/0x20
>>  ? skb_splice_from_iter+0x141/0x450
>>  tcp_sendmsg_locked+0x39e/0xee0
>>  ? _prb_read_valid+0x216/0x290
>>  tcp_sendmsg+0x2d/0x50
>>  inet_sendmsg+0x43/0x80
>>  sock_sendmsg+0x102/0x130
>>  ? vprintk_default+0x1d/0x30
>>  ? vprintk+0x3c/0x70
>>  ? _printk+0x58/0x80
>>  nvme_tcp_try_send_data+0x17d/0x530
>>  nvme_tcp_try_send+0x1b7/0x300
>>  nvme_tcp_io_work+0x3c/0xc0
>>  process_one_work+0x22e/0x420
>>  worker_thread+0x50/0x3f0
>>  ? __pfx_worker_thread+0x10/0x10
>>  kthread+0xd6/0x100
>>  ? __pfx_kthread+0x10/0x10
>>  ret_from_fork+0x3c/0x60
>>  ? __pfx_kthread+0x10/0x10
>>  ret_from_fork_asm+0x1b/0x30
>> ...
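
To make the fix easier to follow: the helper added in patch 1 simply
walks the contiguous range page by page and applies the existing
per-page sendpage_ok() check to each page. In sketch form (see patch 1
for the actual 20-line addition to include/linux/net.h; the sketch
assumes the sendpages_ok(page, len, offset) signature used by the
converted call sites):

	/*
	 * Sketch: return true only if every page backing the contiguous
	 * byte range [offset, offset + len) passes sendpage_ok(), i.e.
	 * none of them is a slab page or has a zero refcount.
	 */
	static inline bool sendpages_ok(struct page *page, size_t len,
					size_t offset)
	{
		struct page *p = page + (offset >> PAGE_SHIFT);
		size_t count = 0;

		while (count < len) {
			if (!sendpage_ok(p))
				return false;

			p++;
			count += PAGE_SIZE;
		}

		return true;
	}
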
>>
>> ---
>> Changelog:
>> v2, fix typo in patch subject
>>
>> Ofir Gal (4):
>>   net: introduce helper sendpages_ok()
>>   nvme-tcp: use sendpages_ok() instead of sendpage_ok()
>>   drbd: use sendpages_ok() instead of sendpage_ok()
>>   libceph: use sendpages_ok() instead of sendpage_ok()
>>
>>  drivers/block/drbd/drbd_main.c |  2 +-
>>  drivers/nvme/host/tcp.c        |  2 +-
>>  include/linux/net.h            | 20 ++++++++++++++++++++
>>  net/ceph/messenger_v1.c        |  2 +-
>>  net/ceph/messenger_v2.c        |  2 +-
>>  5 files changed, 24 insertions(+), 4 deletions(-)
>>
>>  reproduce.sh | 114 +++++++++++++++++++++++++++++++++++++++++++++++++++
>>  1 file changed, 114 insertions(+)
>>  create mode 100755 reproduce.sh
>>
>> diff --git a/reproduce.sh b/reproduce.sh
>> new file mode 100755
>> index 000000000..8ae226b18
>> --- /dev/null
>> +++ b/reproduce.sh
>> @@ -0,0 +1,114 @@
>> +#!/usr/bin/env sh
>> +# SPDX-License-Identifier: MIT
>> +
>> +set -e
>> +
>> +load_modules() {
>> +    modprobe nvme
>> +    modprobe nvme-tcp
>> +    modprobe nvmet
>> +    modprobe nvmet-tcp
>> +}
>> +
>> +setup_ns() {
>> +    local dev=$1
>> +    local num=$2
>> +    local port=$3
>> +    ls $dev > /dev/null
>> +
>> +    mkdir -p /sys/kernel/config/nvmet/subsystems/$num
>> +    cd /sys/kernel/config/nvmet/subsystems/$num
>> +    echo 1 > attr_allow_any_host
>> +
>> +    mkdir -p namespaces/$num
>> +    cd namespaces/$num/
>> +    echo $dev > device_path
>> +    echo 1 > enable
>> +
>> +    ln -s /sys/kernel/config/nvmet/subsystems/$num \
>> +        /sys/kernel/config/nvmet/ports/$port/subsystems/
>> +}
>> +
>> +setup_port() {
>> +    local num=$1
>> +
>> +    mkdir -p /sys/kernel/config/nvmet/ports/$num
>> +    cd /sys/kernel/config/nvmet/ports/$num
>> +    echo "127.0.0.1" > addr_traddr
>> +    echo tcp > addr_trtype
>> +    echo 8009 > addr_trsvcid
>> +    echo ipv4 > addr_adrfam
>> +}
>> +
>> +setup_big_opt_io() {
>> +    local dev=$1
>> +    local name=$2
>> +
>> +    # Change the optimal IO size by creating a dm stripe
>> +    dmsetup create $name --table \
>> +        "0 `blockdev --getsz $dev` striped 1 512 $dev 0"
>> +}
>> +
>> +setup_targets() {
>> +    # Set up ram devices instead of using real nvme devices
>> +    modprobe brd rd_size=1048576 rd_nr=2 # 1GiB
>> +
>> +    setup_big_opt_io /dev/ram0 ram0_big_opt_io
>> +    setup_big_opt_io /dev/ram1 ram1_big_opt_io
>> +
>> +    setup_port 1
>> +    setup_ns /dev/mapper/ram0_big_opt_io 1 1
>> +    setup_ns /dev/mapper/ram1_big_opt_io 2 1
>> +}
>> +
>> +setup_initiators() {
>> +    nvme connect -t tcp -n 1 -a 127.0.0.1 -s 8009
>> +    nvme connect -t tcp -n 2 -a 127.0.0.1 -s 8009
>> +}
>> +
>> +reproduce_warn() {
>> +    local devs=$@
>> +
>> +    # Hangs here
>> +    mdadm --create /dev/md/test_md --level=1 --bitmap=internal \
>> +        --bitmap-chunk=1024K --assume-clean --run --raid-devices=2 $devs
>> +}
>> +
>> +echo "###################################
>> +
>> +The script creates 2 nvme initiators in order to reproduce the bug.
>> +The script doesn't know which controllers it created; choose the new nvme
>> +controllers when asked.
>> +
>> +###################################
>> +
>> +Press enter to continue.
>> +"
>> +
>> +read tmp
>> +
>> +echo "# Creating 2 nvme controllers for the reproduction. Current nvme devices:"
>> +lsblk -s | grep nvme || true
>> +echo "---------------------------------
>> +"
>> +
>> +load_modules
>> +setup_targets
>> +setup_initiators
>> +
>> +sleep 0.1 # Wait for the new nvme ctrls to show up
>> +
>> +echo "# Created 2 nvme devices. nvme devices list:"
>> +
>> +lsblk -s | grep nvme
>> +echo "---------------------------------
>> +"
>> +
>> +echo "# Enter the new nvme devices on separate lines.
>> +Both should be 1G in size"
>> +read dev1
>> +read dev2
>> +
>> +ls /dev/$dev1 > /dev/null
>> +ls /dev/$dev2 > /dev/null
>> +
>> +reproduce_warn /dev/$dev1 /dev/$dev2
>
> Can you convert that into a blktest script?

Yes, will do.
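
For context, the call-site conversion in patches 2-4 then amounts to
replacing the single-page check with the range check. Schematically,
with illustrative variable names rather than the exact per-driver diffs:

	/* Before: only the first page of the range was checked. */
	if (!sendpage_ok(page))
		msg.msg_flags &= ~MSG_SPLICE_PAGES;

	/* After: every page backing [offset, offset + len) is checked. */
	if (!sendpages_ok(page, len, offset))
		msg.msg_flags &= ~MSG_SPLICE_PAGES;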