Adds a test which runs many formats and reset_controllers in parallel.
The intent is to expose timing holes in the controller state machine
which will lead to hung task timeouts and the controller becoming
unavailable.

Reported by https://bugzilla.kernel.org/show_bug.cgi?id=216354

Signed-off-by: Jonathan Derrick <jonathan.derrick@xxxxxxxxx>
---
I seem to have isolated the error mechanism for older kernels, but
6.2.0-rc2 reliably segfaults my QEMU instance (something else to look
into) and I don't have any 'real' hardware to test this on at the
moment. It looks like several passthru commands are able to enqueue
prior/during/after resetting/connecting.

The issue seems to be very heavily timing related, so the loop in the
header is a lot more forceful in this approach.

As far as the loop goes, I've noticed it will typically repro
immediately or pass the whole test.

 tests/nvme/047     | 134 +++++++++++++++++++++++++++++++++++++++++++++
 tests/nvme/047.out |   2 +
 2 files changed, 136 insertions(+)
 create mode 100755 tests/nvme/047
 create mode 100644 tests/nvme/047.out

diff --git a/tests/nvme/047 b/tests/nvme/047
new file mode 100755
index 0000000..fb8609c
--- /dev/null
+++ b/tests/nvme/047
@@ -0,0 +1,134 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-3.0+
+# Copyright (C) 2022 Jonathan Derrick <jonathan.derrick@xxxxxxxxx>
+#
+# Test nvme reset controller during admin passthru
+#
+# Regression for issue reported by
+# https://bugzilla.kernel.org/show_bug.cgi?id=216354
+#
+# Simpler form:
+# for i in {1..50}; do
+#	nvme format -f /dev/nvme0n1 &
+#	echo 1 > /sys/block/nvme0n1/device/reset_controller &
+# done
+
+. tests/nvme/rc
+
+#restrict test to nvme-pci only
+nvme_trtype=pci
+
+DESCRIPTION="test nvme reset controller during admin passthru"
+# NOTE(review): QUICK=1 together with RUN_TIME=300 looks contradictory;
+# confirm whether this test should be marked quick.
+QUICK=1
+CAN_BE_ZONED=1
+
+# Upper bound, in seconds, on how long the format/reset loop runs
+RUN_TIME=300
+# When "true", remove and rescan the PCI device if the controller vanishes
+RESET_PCIE=true
+
+requires() {
+	_nvme_requires
+}
+
+device_requires() {
+	_require_test_dev_is_nvme
+}
+
+# Remove PCI device $1 through sysfs, then rescan the PCI bus
+remove_and_rescan() {
+	local pdev=$1
+	echo 1 > /sys/bus/pci/devices/"$pdev"/remove
+	echo 1 > /sys/bus/pci/rescan
+}
+
+# Issue formats and controller resets in parallel for up to RUN_TIME seconds
+# and report if the controller disappears or stops returning to "live"
+test_device() {
+	echo "Running ${TEST_NAME}"
+
+	local pdev
+	local blkdev
+	local ctrldev
+	local sysfs
+	local max_timeout
+	local timeout
+	local timeleft
+	local start
+	local last_live
+	local i
+
+	pdev="$(_get_pci_dev_from_blkdev)"
+	blkdev="${TEST_DEV_SYSFS##*/}"
+	ctrldev="$(echo "$blkdev" | grep -Eo 'nvme[0-9]+')"
+	sysfs="/sys/block/$blkdev/device"
+
+	# The hung task detector may be compiled out (file missing) or disabled
+	# (value 0). Either would make the hang threshold below zero, so fall
+	# back to 120 seconds (the usual kernel default for this sysctl).
+	max_timeout=$(cat /proc/sys/kernel/hung_task_timeout_secs 2> /dev/null)
+	if [[ ! $max_timeout =~ ^[0-9]+$ ]] || ((max_timeout == 0)); then
+		max_timeout=120
+	fi
+	timeout=$((max_timeout * 3 / 4))
+
+	sleep 5
+
+	start=$SECONDS
+	# Start the "last seen live" clock now so that the elapsed-time checks
+	# below are well defined even if the controller is never seen live
+	last_live=$start
+	while [[ $((SECONDS - start)) -le $RUN_TIME ]]; do
+		if [[ $(cat "$sysfs/state") == "live" ]]; then
+			last_live=$SECONDS
+		fi
+
+		# Failure case appears to stack up formats while controller is resetting/connecting
+		if [[ $(pgrep -cf "nvme format") -lt 100 ]]; then
+			for ((i=0; i<100; i++)); do
+				nvme format -f "$TEST_DEV" &
+				echo 1 > "$sysfs/reset_controller" &
+			done &> /dev/null
+		fi
+
+		# Might have failed probe, so reset and continue test
+		if [[ $((SECONDS - last_live)) -gt 10 && \
+			! -c "/dev/$ctrldev" && "$RESET_PCIE" == true ]]; then
+			remove_and_rescan "$pdev" &
+
+			timeleft=$((max_timeout - timeout))
+			sleep $((timeleft < 30 ? timeleft : 30))
+			if [[ ! -c "/dev/$ctrldev" ]]; then
+				echo "/dev/$ctrldev missing"
+				echo "failed to reset $ctrldev's pcie device $pdev"
+				break
+			fi
+			sleep 5
+			continue
+		fi
+
+		if [[ $((SECONDS - last_live)) -gt $timeout ]]; then
+			if [[ ! -c "/dev/$ctrldev" ]]; then
+				echo "/dev/$ctrldev missing"
+				break
+			fi
+
+			# Assume the controller is hung and unrecoverable
+			if [[ -f "$sysfs/state" ]]; then
+				echo "nvme controller hung ($(cat "$sysfs/state"))"
+			else
+				echo "nvme controller hung"
+			fi
+			break
+		fi
+	done
+
+	if [[ ! -c "/dev/$ctrldev" || $(cat "$sysfs/state") != "live" ]]; then
+		echo "nvme still not live after $((SECONDS - last_live)) seconds!"
+	fi
+	udevadm settle
+
+	echo "Test complete"
+}
diff --git a/tests/nvme/047.out b/tests/nvme/047.out
new file mode 100644
index 0000000..915d0a2
--- /dev/null
+++ b/tests/nvme/047.out
@@ -0,0 +1,2 @@
+Running nvme/047
+Test complete
-- 
2.27.0