The following changes since commit f0af0812543d3d9a4019e2979c6e32f479ee70c4: Update mailing list details in README.rst (2024-09-06 15:22:48 -0600) are available in the Git repository at: git://git.kernel.dk/fio.git master for you to fetch changes up to bcd46be2adaa4afc32b836ad6137798544a3d80a: Merge branch 'atomic-writes' (2024-09-16 20:23:06 -0600) ---------------------------------------------------------------- Alan Adamson (1): pvsync2: Support RWF_ATOMIC Jens Axboe (1): Merge branch 'atomic-writes' John Garry (8): os-linux: Define RWF_ATOMIC os: Reintroduce atomic write support libaio: Support RWF_ATOMIC io_uring: Support RWF_ATOMIC tools/fiograph: Update for atomic support doc: Document atomic command fio: Support verify_write_sequence examples: Add example for atomic write verify HOWTO.rst | 25 +++++++++++++++++++++++++ engines/io_uring.c | 5 ++++- engines/libaio.c | 7 ++++++- engines/sync.c | 9 ++++++--- examples/atomic-verify.fio | 36 ++++++++++++++++++++++++++++++++++++ fio.1 | 21 +++++++++++++++++++++ init.c | 14 ++++++++++++++ ioengines.h | 2 ++ options.c | 13 +++++++++++++ os/os-linux.h | 5 +++++ thread_options.h | 1 + tools/fiograph/fiograph.conf | 6 +++--- verify.c | 7 ++++--- 13 files changed, 140 insertions(+), 11 deletions(-) create mode 100644 examples/atomic-verify.fio --- Diff of recent changes: diff --git a/HOWTO.rst b/HOWTO.rst index a363206d..4f071484 100644 --- a/HOWTO.rst +++ b/HOWTO.rst @@ -2501,6 +2501,20 @@ with the caveat that when used on the command line, they must come after the For direct I/O, requests will only succeed if cache invalidation isn't required, file blocks are fully allocated and the disk request could be issued immediately. +.. option:: atomic=bool : [pvsync2] [libaio] [io_uring] + + This option means that writes are issued with torn-write protection, meaning + that for a power fail or kernel crash, all or none of the data from the write + will be stored, but never a mix of old and new data. 
Torn-write protection is + also known as atomic writes. + + This option sets the RWF_ATOMIC flag (supported from the 6.11 Linux kernel) on + a per-IO basis. + + Writes with RWF_ATOMIC set will be rejected by the kernel when the file does + not support torn-write protection. To learn a file's torn-write limits, issue + statx with STATX_WRITE_ATOMIC. + .. option:: fdp=bool : [io_uring_cmd] [xnvme] Enable Flexible Data Placement mode for write commands. @@ -3988,6 +4002,17 @@ Verification instead resets the file after the write phase and then replays I/Os for the verification phase. +.. option:: verify_write_sequence=bool + + Verify the header write sequence number. In a scenario with multiple jobs, + verification of the write sequence number may fail. Disabling this option + will mean that write sequence number checking is skipped. Doing that can be + useful for testing atomic writes, as it means that checksum verification can + still be attempted. For when :option:`atomic` is enabled, checksum + verification is expected to succeed (while write sequence checking can still + fail). + Defaults to true. + .. option:: trim_percentage=int Number of verify blocks to discard/trim. 
diff --git a/engines/io_uring.c b/engines/io_uring.c index 1d4a6118..96a042a8 100644 --- a/engines/io_uring.c +++ b/engines/io_uring.c @@ -392,6 +392,8 @@ static int fio_ioring_prep(struct thread_data *td, struct io_u *io_u) sqe->rw_flags = 0; if (o->nowait) sqe->rw_flags |= RWF_NOWAIT; + if (td->o.oatomic && io_u->ddir == DDIR_WRITE) + sqe->rw_flags |= RWF_ATOMIC; /* * Since io_uring can have a submission context (sqthread_poll) @@ -1582,7 +1584,8 @@ static struct ioengine_ops ioengine_uring = { .name = "io_uring", .version = FIO_IOOPS_VERSION, .flags = FIO_ASYNCIO_SYNC_TRIM | FIO_NO_OFFLOAD | - FIO_ASYNCIO_SETS_ISSUE_TIME, + FIO_ASYNCIO_SETS_ISSUE_TIME | + FIO_ATOMICWRITES, .init = fio_ioring_init, .post_init = fio_ioring_post_init, .io_u_init = fio_ioring_io_u_init, diff --git a/engines/libaio.c b/engines/libaio.c index aaccc7ce..c2d43793 100644 --- a/engines/libaio.c +++ b/engines/libaio.c @@ -110,6 +110,10 @@ static int fio_libaio_prep(struct thread_data *td, struct io_u *io_u) io_prep_pwrite(iocb, f->fd, io_u->xfer_buf, io_u->xfer_buflen, io_u->offset); if (o->nowait) iocb->aio_rw_flags |= RWF_NOWAIT; +#ifdef FIO_HAVE_RWF_ATOMIC + if (td->o.oatomic) + iocb->aio_rw_flags |= RWF_ATOMIC; +#endif } else if (ddir_sync(io_u->ddir)) io_prep_fsync(iocb, f->fd); @@ -440,7 +444,8 @@ FIO_STATIC struct ioengine_ops ioengine = { .name = "libaio", .version = FIO_IOOPS_VERSION, .flags = FIO_ASYNCIO_SYNC_TRIM | - FIO_ASYNCIO_SETS_ISSUE_TIME, + FIO_ASYNCIO_SETS_ISSUE_TIME | + FIO_ATOMICWRITES, .init = fio_libaio_init, .post_init = fio_libaio_post_init, .prep = fio_libaio_prep, diff --git a/engines/sync.c b/engines/sync.c index b8be4eb3..729d8a71 100644 --- a/engines/sync.c +++ b/engines/sync.c @@ -175,9 +175,11 @@ static enum fio_q_status fio_pvsyncio2_queue(struct thread_data *td, if (io_u->ddir == DDIR_READ) ret = preadv2(f->fd, iov, 1, io_u->offset, flags); - else if (io_u->ddir == DDIR_WRITE) + else if (io_u->ddir == DDIR_WRITE) { + if (td->o.oatomic) + flags |= 
RWF_ATOMIC; ret = pwritev2(f->fd, iov, 1, io_u->offset, flags); - else if (io_u->ddir == DDIR_TRIM) { + } else if (io_u->ddir == DDIR_TRIM) { do_io_u_trim(td, io_u); return FIO_Q_COMPLETED; } else @@ -476,7 +478,8 @@ static struct ioengine_ops ioengine_pvrw2 = { .open_file = generic_open_file, .close_file = generic_close_file, .get_file_size = generic_get_file_size, - .flags = FIO_SYNCIO, + .flags = FIO_SYNCIO | + FIO_ATOMICWRITES, .options = options, .option_struct_size = sizeof(struct psyncv2_options), }; diff --git a/examples/atomic-verify.fio b/examples/atomic-verify.fio new file mode 100644 index 00000000..17bcd89f --- /dev/null +++ b/examples/atomic-verify.fio @@ -0,0 +1,36 @@ +# Data verification with atomic writes +# +# Some background on atomic writes: +# +# The main selling point of atomic writes is that it is guaranteed writes +# to storage will not be torn for a power failure or kernel crash. + +# Another aspect of atomic writes is that they handle racing writes and +# reads, such that a read racing with a write will see all the data from +# the write or none. Well, SCSI and NVMe guarantee this if using +# RWF_ATOMIC, but it is not formally stated as a feature of RWF_ATOMIC. +# +# Fio verify mode can be used to prove that atomic writes can make "safe" +# racing reads and writes. This is done by having many jobs in a xsum verify +# mode. In this way, xsums should be correct, although a job may be +# reading a data block written by another job; however +# verify_write_sequence must be disabled, as it cannot be helped that data +# blocks will be out of sequence with many jobs. +# +# Atomic write limits: +# For a block device, the max block size for atomic=1 is in +# /sys/block/sdXXX/queue/atomic_write_unit_max_bytes +# or this value can also be read with a statx syscall on the bdev file. 
+ +[write-and-verify] +rw=randwrite +bs=4k +direct=1 +ioengine=libaio +iodepth=16 +verify=crc64 +atomic=1 +verify_write_sequence=0 +numjobs=10 +# Use /dev/XXX or filename +filename=/dev/XXX diff --git a/fio.1 b/fio.1 index a4ab07ed..0fd0fb25 100644 --- a/fio.1 +++ b/fio.1 @@ -2266,6 +2266,19 @@ cached data. Currently the RWF_NOWAIT flag does not supported for cached write. For direct I/O, requests will only succeed if cache invalidation isn't required, file blocks are fully allocated and the disk request could be issued immediately. .TP +.BI (pvsync2,libaio,io_uring)atomic \fR=\fPbool +This option means that writes are issued with torn-write protection, meaning +that for a power fail or kernel crash, all or none of the data from the write +will be stored, but never a mix of old and new data. Torn-write protection is +also known as atomic writes. + +This option sets the RWF_ATOMIC flag (supported from the 6.11 Linux kernel) on +a per-IO basis. + +Writes with RWF_ATOMIC set will be rejected by the kernel when the file does +not support torn-write protection. To learn a file's torn-write limits, issue +statx with STATX_WRITE_ATOMIC. +.TP .BI (io_uring_cmd,xnvme)fdp \fR=\fPbool Enable Flexible Data Placement mode for write commands. .TP @@ -3713,6 +3726,14 @@ Enable experimental verification. Standard verify records I/O metadata for later use during the verification phase. Experimental verify instead resets the file after the write phase and then replays I/Os for the verification phase. .TP +.BI verify_write_sequence \fR=\fPbool +Verify the header write sequence number. In a scenario with multiple jobs, +verification of the write sequence number may fail. Disabling this option +will mean that write sequence number checking is skipped. Doing that can be +useful for testing atomic writes, as it means that checksum verification can +still be attempted. 
For when \fBatomic\fR is enabled, checksum verification +is expected to succeed (while write sequence checking can still fail). +.TP .BI trim_percentage \fR=\fPint Number of verify blocks to discard/trim. .TP diff --git a/init.c b/init.c index 414535cc..96a03d98 100644 --- a/init.c +++ b/init.c @@ -853,6 +853,20 @@ static int fixup_options(struct thread_data *td) (o->max_bs[DDIR_WRITE] % o->verify_interval)) o->verify_interval = gcd(o->min_bs[DDIR_WRITE], o->max_bs[DDIR_WRITE]); + + if (td->o.verify_only) + o->verify_write_sequence = 0; + } + + if (td->o.oatomic) { + if (!td_ioengine_flagged(td, FIO_ATOMICWRITES)) { + log_err("fio: engine does not support atomic writes\n"); + td->o.oatomic = 0; + ret |= 1; + } + + if (!td_write(td)) + td->o.oatomic = 0; } if (o->pre_read) { diff --git a/ioengines.h b/ioengines.h index b9834fec..1531cd89 100644 --- a/ioengines.h +++ b/ioengines.h @@ -96,6 +96,7 @@ enum { __FIO_RO_NEEDS_RW_OPEN, /* open files in rw mode even if we have a read job; only affects ioengines using generic_open_file */ __FIO_MULTI_RANGE_TRIM, /* ioengine supports trim with more than one range */ + __FIO_ATOMICWRITES, /* ioengine supports atomic writes */ __FIO_IOENGINE_F_LAST, /* not a real bit; used to count number of bits */ }; @@ -120,6 +121,7 @@ enum fio_ioengine_flags { FIO_SKIPPABLE_IOMEM_ALLOC = 1 << __FIO_SKIPPABLE_IOMEM_ALLOC, FIO_RO_NEEDS_RW_OPEN = 1 << __FIO_RO_NEEDS_RW_OPEN, FIO_MULTI_RANGE_TRIM = 1 << __FIO_MULTI_RANGE_TRIM, + FIO_ATOMICWRITES = 1 << __FIO_ATOMICWRITES, }; /* diff --git a/options.c b/options.c index 5a6b0a06..c35878f7 100644 --- a/options.c +++ b/options.c @@ -2926,6 +2926,7 @@ struct fio_option fio_options[FIO_MAX_OPTS] = { .category = FIO_OPT_C_IO, .group = FIO_OPT_G_IO_TYPE, }, +#ifdef FIO_HAVE_RWF_ATOMIC { .name = "atomic", .lname = "Atomic I/O", @@ -2936,6 +2937,7 @@ struct fio_option fio_options[FIO_MAX_OPTS] = { .category = FIO_OPT_C_IO, .group = FIO_OPT_G_IO_TYPE, }, +#endif { .name = "buffered", .lname = "Buffered 
I/O", @@ -3395,6 +3397,17 @@ struct fio_option fio_options[FIO_MAX_OPTS] = { .category = FIO_OPT_C_IO, .group = FIO_OPT_G_VERIFY, }, + { + .name = "verify_write_sequence", + .lname = "Verify write sequence number", + .off1 = offsetof(struct thread_options, verify_write_sequence), + .type = FIO_OPT_BOOL, + .def = "1", + .help = "Verify header write sequence number", + .parent = "verify", + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_VERIFY, + }, #ifdef FIO_HAVE_TRIM { .name = "trim_percentage", diff --git a/os/os-linux.h b/os/os-linux.h index 4d150311..ead8295c 100644 --- a/os/os-linux.h +++ b/os/os-linux.h @@ -62,6 +62,7 @@ #define FIO_HAVE_BYTEORDER_FUNCS #define FIO_HAVE_PWRITEV2 #define FIO_HAVE_SHM_ATTACH_REMOVED +#define FIO_HAVE_RWF_ATOMIC #ifdef MAP_HUGETLB #define FIO_HAVE_MMAP_HUGE @@ -328,6 +329,10 @@ static inline int fio_set_sched_idle(void) #define RWF_NOWAIT 0x00000008 #endif +#ifndef RWF_ATOMIC +#define RWF_ATOMIC 0x00000040 +#endif + #ifndef RWF_WRITE_LIFE_SHIFT #define RWF_WRITE_LIFE_SHIFT 4 #define RWF_WRITE_LIFE_SHORT (1 << RWF_WRITE_LIFE_SHIFT) diff --git a/thread_options.h b/thread_options.h index ee1e5b31..d0e0a4ae 100644 --- a/thread_options.h +++ b/thread_options.h @@ -156,6 +156,7 @@ struct thread_options { unsigned int experimental_verify; unsigned int verify_state; unsigned int verify_state_save; + unsigned int verify_write_sequence; unsigned int use_thread; unsigned int unlink; unsigned int unlink_each_loop; diff --git a/tools/fiograph/fiograph.conf b/tools/fiograph/fiograph.conf index 122f2baf..75712180 100644 --- a/tools/fiograph/fiograph.conf +++ b/tools/fiograph/fiograph.conf @@ -51,13 +51,13 @@ specific_options=https http_host http_user http_pass http_s3_key http_s3_ke specific_options=ime_psync ime_psyncv [ioengine_io_uring] -specific_options=hipri cmdprio_percentage cmdprio_class cmdprio cmdprio_bssplit fixedbufs registerfiles sqthread_poll sqthread_poll_cpu nonvectored nowait force_async +specific_options=hipri 
cmdprio_percentage cmdprio_class cmdprio cmdprio_bssplit fixedbufs registerfiles sqthread_poll sqthread_poll_cpu nonvectored nowait force_async atomic [ioengine_io_uring_cmd] specific_options=hipri cmdprio_percentage cmdprio_class cmdprio cmdprio_bssplit fixedbufs registerfiles sqthread_poll sqthread_poll_cpu nonvectored nowait force_async cmd_type md_per_io_size pi_act pi_chk apptag apptag_mask [ioengine_libaio] -specific_options=userspace_reap cmdprio_percentage cmdprio_class cmdprio cmdprio_bssplit nowait +specific_options=userspace_reap cmdprio_percentage cmdprio_class cmdprio cmdprio_bssplit nowait atomic [ioengine_libblkio] specific_options=libblkio_driver libblkio_path libblkio_pre_connect_props libblkio_num_entries libblkio_queue_size libblkio_pre_start_props hipri libblkio_vectored libblkio_write_zeroes_on_trim libblkio_wait_mode libblkio_force_enable_completion_eventfd @@ -99,7 +99,7 @@ specific_options=hostname bindname port verb specific_options=hipri readfua writefua sg_write_mode stream_id [ioengine_pvsync2] -specific_options=hipri hipri_percentage nowait sync psync vsync pvsync +specific_options=hipri hipri_percentage nowait sync psync vsync pvsync atomic [ioengine_xnvme] specific_options=hipri sqthread_poll xnvme_be xnvme_async xnvme_sync xnvme_admin xnvme_dev_nsid xnvme_iovec diff --git a/verify.c b/verify.c index b2fede24..f3d228ba 100644 --- a/verify.c +++ b/verify.c @@ -848,12 +848,13 @@ static int verify_header(struct io_u *io_u, struct thread_data *td, /* * For read-only workloads, the program cannot be certain of the * last numberio written to a block. Checking of numberio will be - * done only for workloads that write data. For verify_only, - * numberio check is skipped. + * done only for workloads that write data. For verify_only or + * any mode de-selecting verify_write_sequence, numberio check is + * skipped. 
*/ if (td_write(td) && (td_min_bs(td) == td_max_bs(td)) && !td->o.time_based) - if (!td->o.verify_only) + if (td->o.verify_write_sequence) if (hdr->numberio != io_u->numberio) { log_err("verify: bad header numberio %"PRIu16 ", wanted %"PRIu16,