Adds a test which runs many formats and reset_controllers in parallel. The intent is to expose timing holes in the controller state machine which will lead to hung task timing and the controller becoming unavailable. Reported by https://bugzilla.kernel.org/show_bug.cgi?id=216354 Signed-off-by: Jonathan Derrick <jonathan.derrick@xxxxxxxxx> --- tests/nvme/046 | 85 ++++++++++++++++++++++++++++++++++++++++++++++ tests/nvme/046.out | 2 ++ 2 files changed, 87 insertions(+) create mode 100755 tests/nvme/046 create mode 100644 tests/nvme/046.out diff --git a/tests/nvme/046 b/tests/nvme/046 new file mode 100755 index 0000000..4b47783 --- /dev/null +++ b/tests/nvme/046 @@ -0,0 +1,85 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-3.0+ +# Copyright (C) 2022 Jonathan Derrick <jonathan.derrick@xxxxxxxxx> +# +# Test nvme reset controller during admin passthru +# +# Regression for issue reported by +# https://bugzilla.kernel.org/show_bug.cgi?id=216354 + +. tests/nvme/rc + +#restrict test to nvme-pci only +nvme_trtype=pci + +DESCRIPTION="test nvme reset controller during admin passthru" +QUICK=1 +CAN_BE_ZONED=1 + +requires() { + _nvme_requires +} + +device_requires() { + _require_test_dev_is_nvme +} + +test_device() { + echo "Running ${TEST_NAME}" + + local sysfs + local attr + local m + + sysfs="$TEST_DEV_SYSFS/device" + timeout=$(($(cat /proc/sys/kernel/hung_task_timeout_secs) / 2)) + + sleep 5 + + if [[ ! -d "$sysfs" ]]; then + echo "$sysfs doesn't exist" + fi + + # do reset controller/format loops + # don't check status now because a timing race is desired + i=0 + start=0 + timing_out=false + while [[ $i -le 1000 ]]; do + start=$SECONDS + if [[ -f "$sysfs/reset_controller" ]]; then + echo 1 > "$sysfs/reset_controller" 2>/dev/null & + i=$((i+1)) + fi + nvme format -l 0 -f $TEST_DEV 2>/dev/null & + + #Assume the controller is hung and unrecoverable + if [[ $(($SECONDS - $start)) -gt $timeout ]]; then + echo "nvme controller timing out" + timing_out=true + break + fi + done + + { kill $!; wait; } &> /dev/null + + # at this point it may have waited hung_task_timeout / 2 already, so + # only wait 25% longer for a total of about 75% of allowed timeout + m=0 + while [[ $m -le $((timeout / 2)) ]]; do + if [[ $timing_out == true ]]; then + break + fi + if grep -q live "$sysfs/state"; then + break + fi + sleep 1 + m=$((m+1)) + done + if ! grep -q live "$sysfs/state"; then + echo "nvme still not live after $(($SECONDS - $start)) seconds!" + fi + udevadm settle + + echo "Test complete" +} diff --git a/tests/nvme/046.out b/tests/nvme/046.out new file mode 100644 index 0000000..2b5fa6a --- /dev/null +++ b/tests/nvme/046.out @@ -0,0 +1,2 @@ +Running nvme/046 +Test complete -- 2.31.1