Verification is not supported in ZBD mode, which is codified in this assert: if (zb->reset_zone || zbd_zone_full(f, zb, min_bs)) { assert(td->o.verify == VERIFY_NONE); However, the sequential write property is excellent for actually doing verification if write_list and seeds are maintained per-zone. It will work automatically with * overlapping jobs * zone resets in the middle of a job * different write block sizes This also paves the way to "verify before zone reset" and "verify zeroes after wp" features. Introduce *) per-zone seed, incremented with each reset, so that patterns differ *) per-zone random generator state, reseeded with each zone reset *) per-zone write I/O list (a linked list is natural for sequential writes) *) per-zone verify_mutex This will be used for verify-zeroes-after-wp, definitely. Currently it is more of a peace-of-mind member; otherwise list corruption asserts trigger, IIRC. TODO: explain why it is needed. Delete ->verify_block -- obsoleted. There are also some funny things going on with flushing and resetting files in the middle of I/O, but the non-overlapping case more or less works. Signed-off-by: Alexey Dobriyan (SK hynix) <adobriyan@xxxxxxxxx> --- backend.c | 44 +++++++++++++++++++++++++++++--- fio.h | 2 ++ init.c | 1 + io_u.c | 2 +- iolog.c | 24 ++++++++++++------ iolog.h | 1 + verify.c | 45 ++++++++++++++++++++++++++------- verify.h | 4 ++- zbd.c | 75 ++++++++++++++++++++++++++++--------------------------- zbd.h | 20 +++++++++++++-- 10 files changed, 158 insertions(+), 60 deletions(-) diff --git a/backend.c b/backend.c index 452975cf..05ca5dc1 100644 --- a/backend.c +++ b/backend.c @@ -48,6 +48,7 @@ #include "rate-submit.h" #include "helper_thread.h" #include "pshared.h" +#include "zbd.h" #include "zone-dist.h" static struct fio_sem *startup_sem; @@ -615,7 +616,7 @@ static enum fio_q_status io_u_submit(struct thread_data *td, struct io_u *io_u) * The main verify engine. 
Runs over the writes we previously submitted, * reads the blocks back in, and checks the crc/md5 of the data. */ -static void do_verify(struct thread_data *td, uint64_t verify_bytes) +static void do_verify(struct thread_data *td, uint64_t verify_bytes, struct fio_file *td_f, struct fio_zone_info *zi, bool sync) { struct fio_file *f; struct io_u *io_u; @@ -629,8 +630,12 @@ static void do_verify(struct thread_data *td, uint64_t verify_bytes) * read from disk. */ for_each_file(td, f, i) { + if (td_f && f != td_f) + continue; if (!fio_file_open(f)) continue; + if (!sync) + continue; if (fio_io_sync(td, f)) break; if (file_invalidate_cache(td, f)) @@ -677,7 +682,7 @@ static void do_verify(struct thread_data *td, uint64_t verify_bytes) if (!io_u) break; - if (get_next_verify(td, io_u)) { + if (get_next_verify(td, io_u, td_f, zi)) { put_io_u(td, io_u); break; } @@ -1516,6 +1521,35 @@ static uint64_t do_dry_run(struct thread_data *td) return td->bytes_done[DDIR_WRITE] + td->bytes_done[DDIR_TRIM]; } +static void do_verify_zbd(struct thread_data *td, uint64_t verify_bytes) +{ + struct fio_file *f; + unsigned int i; + + for_each_file(td, f, i) { + struct zoned_block_device_info *zbd = f->zbd_info; + bool sync = true; + + if (!zbd) + continue; + + for (uint32_t z = f->min_zone; z < f->max_zone; z++) { + struct fio_zone_info *zi = &zbd->zone_info[z]; + + if (!zbd_zone_swr(zi)) + continue; + + if (pthread_mutex_trylock(&zi->verify_mutex) != 0) { + /* Someone else is verifying this zone. 
*/ + continue; + } + do_verify(td, verify_bytes, f, zi, sync); + pthread_mutex_unlock(&zi->verify_mutex); + sync = false; + } + } +} + struct fork_data { struct thread_data *td; struct sk_out *sk_out; @@ -1839,7 +1873,11 @@ static void *thread_main(void *data) fio_gettime(&td->start, NULL); - do_verify(td, verify_bytes); + if (td->o.zone_mode == ZONE_MODE_ZBD) { + do_verify_zbd(td, verify_bytes); + } else { + do_verify(td, verify_bytes, NULL, NULL, true); + } /* * See comment further up for why this is done here. diff --git a/fio.h b/fio.h index 20ca80e2..42df7a50 100644 --- a/fio.h +++ b/fio.h @@ -140,6 +140,7 @@ enum { FIO_RAND_POISSON2_OFF, FIO_RAND_POISSON3_OFF, FIO_RAND_PRIO_CMDS, + FIO_RAND_ZBD, FIO_RAND_NR_OFFS, }; @@ -256,6 +257,7 @@ struct thread_data { struct frand_state buf_state; struct frand_state buf_state_prev; struct frand_state dedupe_state; + struct frand_state zbd_state; struct frand_state zone_state; struct frand_state prio_state; diff --git a/init.c b/init.c index b5315334..d41a23ff 100644 --- a/init.c +++ b/init.c @@ -1029,6 +1029,7 @@ static void td_fill_rand_seeds_internal(struct thread_data *td, bool use64) init_rand_seed(&td->poisson_state[1], td->rand_seeds[FIO_RAND_POISSON2_OFF], 0); init_rand_seed(&td->poisson_state[2], td->rand_seeds[FIO_RAND_POISSON3_OFF], 0); init_rand_seed(&td->dedupe_state, td->rand_seeds[FIO_DEDUPE_OFF], false); + init_rand_seed(&td->zbd_state, td->rand_seeds[FIO_RAND_ZBD], use64); init_rand_seed(&td->zone_state, td->rand_seeds[FIO_RAND_ZONE_OFF], false); init_rand_seed(&td->prio_state, td->rand_seeds[FIO_RAND_PRIO_CMDS], false); diff --git a/io_u.c b/io_u.c index 18e94617..3cd7fb71 100644 --- a/io_u.c +++ b/io_u.c @@ -1610,7 +1610,7 @@ static bool check_get_verify(struct thread_data *td, struct io_u *io_u) get_verify = 1; } - if (get_verify && !get_next_verify(td, io_u)) { + if (get_verify && !get_next_verify(td, io_u, NULL, NULL)) { td->verify_batch--; return true; } diff --git a/iolog.c b/iolog.c index 
917a446c..732861a8 100644 --- a/iolog.c +++ b/iolog.c @@ -19,6 +19,7 @@ #include "smalloc.h" #include "blktrace.h" #include "pshared.h" +#include "zbd.h" #include <netinet/in.h> #include <netinet/tcp.h> @@ -231,6 +232,7 @@ void log_io_piece(struct thread_data *td, struct io_u *io_u) ipo->file = io_u->file; ipo->offset = io_u->offset; ipo->len = io_u->buflen; + ipo->seed = io_u->rand_seed; ipo->numberio = io_u->numberio; ipo->flags = IP_F_IN_FLIGHT; @@ -241,12 +243,20 @@ void log_io_piece(struct thread_data *td, struct io_u *io_u) td->trim_entries++; } - /* - * Only sort writes if we don't have a random map in which case we need - * to check for duplicate blocks and drop the old one, which we rely on - * the rb insert/lookup for handling. - */ - if (file_randommap(td, ipo->file)) { + if (td->o.zone_mode == ZONE_MODE_ZBD) { + struct fio_file *f = ipo->file; + uint32_t z = zbd_zone_idx(f, ipo->offset); + struct fio_zone_info *zi = &f->zbd_info->zone_info[z]; + + flist_add_tail(&ipo->list, &zi->write_list); + ipo->flags |= IP_F_ONLIST; + return; + } else if (file_randommap(td, ipo->file)) { + /* + * Only sort writes if we don't have a random map in which case + * we need to check for duplicate blocks and drop the old one, + * which we rely on the rb insert/lookup for handling. 
+ */ INIT_FLIST_HEAD(&ipo->list); flist_add_tail(&ipo->list, &td->io_hist_list); ipo->flags |= IP_F_ONLIST; @@ -322,7 +332,7 @@ void unlog_io_piece(struct thread_data *td, struct io_u *io_u) if (ipo->flags & IP_F_ONRB) rb_erase(&ipo->rb_node, &td->io_hist_tree); - else if (ipo->flags & IP_F_ONLIST) + else flist_del(&ipo->list); free(ipo); diff --git a/iolog.h b/iolog.h index 981081f9..7eddb8e0 100644 --- a/iolog.h +++ b/iolog.h @@ -211,6 +211,7 @@ struct io_piece { struct fio_file *file; }; unsigned long long offset; + uint64_t seed; unsigned short numberio; unsigned long len; unsigned int flags; diff --git a/verify.c b/verify.c index b7fa6693..025e3eb0 100644 --- a/verify.c +++ b/verify.c @@ -11,6 +11,7 @@ #include "fio.h" #include "verify.h" #include "trim.h" +#include "zbd.h" #include "lib/rand.h" #include "lib/hweight.h" #include "lib/pattern.h" @@ -54,7 +55,16 @@ void fill_verify_pattern(struct thread_data *td, void *p, unsigned int len, if (!o->verify_pattern_bytes) { dprint(FD_VERIFY, "fill random bytes len=%u\n", len); - if (!use_seed) { + if (use_seed) { + } else if (td->o.zone_mode == ZONE_MODE_ZBD) { + struct fio_file *f = io_u->file; + uint32_t z = zbd_zone_idx(f, io_u->offset); + struct fio_zone_info *zi = &f->zbd_info->zone_info[z]; + + seed = __rand(&zi->rand_state); + if (sizeof(int) != sizeof(long *)) + seed *= __rand(&zi->rand_state); + } else { seed = __rand(&td->verify_state); if (sizeof(int) != sizeof(long *)) seed *= (unsigned long)__rand(&td->verify_state); @@ -1291,7 +1301,7 @@ void populate_verify_io_u(struct thread_data *td, struct io_u *io_u) fill_pattern_headers(td, io_u, 0, 0); } -int get_next_verify(struct thread_data *td, struct io_u *io_u) +int get_next_verify(struct thread_data *td, struct io_u *io_u, struct fio_file *td_f, struct fio_zone_info *zi) { struct io_piece *ipo = NULL; @@ -1301,7 +1311,26 @@ int get_next_verify(struct thread_data *td, struct io_u *io_u) if (io_u->file) return 0; - if (!RB_EMPTY_ROOT(&td->io_hist_tree)) { 
+ if (zi) { + pthread_mutex_lock(&zi->mutex); + if (!flist_empty(&zi->write_list)) { + ipo = flist_first_entry(&zi->write_list, struct io_piece, list); + + /* + * Ensure that the associated IO has completed + */ + read_barrier(); + if (ipo->flags & IP_F_IN_FLIGHT) { + pthread_mutex_unlock(&zi->mutex); + goto nothing; + } + + flist_del(&ipo->list); + assert(ipo->flags & IP_F_ONLIST); + ipo->flags &= ~IP_F_ONLIST; + } + pthread_mutex_unlock(&zi->mutex); + } else if (!RB_EMPTY_ROOT(&td->io_hist_tree)) { struct fio_rb_node *n = rb_first(&td->io_hist_tree); ipo = rb_entry(n, struct io_piece, rb_node); @@ -1332,10 +1361,13 @@ int get_next_verify(struct thread_data *td, struct io_u *io_u) } if (ipo) { - td->io_hist_len--; + if (!zi) { + td->io_hist_len--; + } io_u->offset = ipo->offset; io_u->buflen = ipo->len; + io_u->rand_seed = ipo->seed; io_u->numberio = ipo->numberio; io_u->file = ipo->file; io_u_set(td, io_u, IO_U_F_VER_LIST); @@ -1363,11 +1395,6 @@ int get_next_verify(struct thread_data *td, struct io_u *io_u) free(ipo); dprint(FD_VERIFY, "get_next_verify: ret io_u %p\n", io_u); - if (!td->o.verify_pattern_bytes) { - io_u->rand_seed = __rand(&td->verify_state); - if (sizeof(int) != sizeof(long *)) - io_u->rand_seed *= __rand(&td->verify_state); - } return 0; } diff --git a/verify.h b/verify.h index 539e6f6c..f046d05b 100644 --- a/verify.h +++ b/verify.h @@ -7,6 +7,8 @@ #define FIO_HDR_MAGIC 0xacca +struct fio_zone_info; + enum { VERIFY_NONE = 0, /* no verification */ VERIFY_HDR_ONLY, /* verify header only, kept for sake of @@ -94,7 +96,7 @@ struct vhdr_xxhash { * Verify helpers */ extern void populate_verify_io_u(struct thread_data *, struct io_u *); -extern int __must_check get_next_verify(struct thread_data *td, struct io_u *); +extern int __must_check get_next_verify(struct thread_data *td, struct io_u *, struct fio_file *, struct fio_zone_info *); extern int __must_check verify_io_u(struct thread_data *, struct io_u **); extern int verify_io_u_async(struct 
thread_data *, struct io_u **); extern void fill_verify_pattern(struct thread_data *td, void *p, unsigned int len, struct io_u *io_u, uint64_t seed, int use_seed); diff --git a/zbd.c b/zbd.c index df46da42..c926df15 100644 --- a/zbd.c +++ b/zbd.c @@ -118,7 +118,7 @@ int zbd_reset_wp(struct thread_data *td, struct fio_file *f, * @offset: offset in bytes. If this offset is in the first zone_size bytes * past the disk size then the index of the sentinel is returned. */ -static uint32_t zbd_zone_idx(const struct fio_file *f, uint64_t offset) +uint32_t zbd_zone_idx(const struct fio_file *f, uint64_t offset) { uint32_t zone_idx; @@ -130,15 +130,6 @@ static uint32_t zbd_zone_idx(const struct fio_file *f, uint64_t offset) return min(zone_idx, f->zbd_info->nr_zones); } -/** - * zbd_zone_swr - Test whether a zone requires sequential writes - * @z: zone info pointer. - */ -static inline bool zbd_zone_swr(struct fio_zone_info *z) -{ - return z->type == ZBD_ZONE_TYPE_SWR; -} - /** * zbd_zone_full - verify whether a minimum number of bytes remain in a zone * @f: file pointer. @@ -499,6 +490,11 @@ out: return ret; } +static inline bool td_use64(const struct thread_data *td) +{ + return td->o.random_generator == FIO_RAND_GEN_TAUSWORTHE64; +} + /* * Allocate zone information and store it into f->zbd_info if zonemode=zbd. * @@ -509,6 +505,7 @@ static int zbd_create_zone_info(struct thread_data *td, struct fio_file *f) struct zoned_block_device_info *zbd; enum zbd_zoned_model zbd_model; pthread_mutexattr_t attr; + uint64_t diff_seed; int ret; assert(td->o.zone_mode == ZONE_MODE_ZBD); @@ -543,6 +540,23 @@ static int zbd_create_zone_info(struct thread_data *td, struct fio_file *f) pthread_mutexattr_init(&attr); pthread_mutexattr_setpshared(&attr, true); pthread_mutex_init(&zbd->mutex, &attr); + + diff_seed = td_use64(td) + ? 
~(uint64_t)0 / zbd->nr_zones + : ~(uint32_t)0 / zbd->nr_zones; + for (uint32_t z = 0; z < zbd->nr_zones; z++) { + struct fio_zone_info *zi = &zbd->zone_info[z]; + + /* + * Spread zone seeds a bit, they will be incremented + * with each reset and better stay unique. + */ + zi->seed = __rand(&td->zbd_state) + z * diff_seed; + init_rand_seed(&zi->rand_state, zi->seed, td_use64(td)); + INIT_FLIST_HEAD(&zi->write_list); + pthread_mutex_init(&zi->verify_mutex, &attr); + } + pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE); for (uint32_t z = 0; z < zbd->nr_zones; z++) { struct fio_zone_info *zi = &zbd->zone_info[z]; @@ -683,13 +697,26 @@ static int zbd_reset_range(struct thread_data *td, struct fio_file *f, zone_idx_e = zbd_zone_idx(f, offset + length); ze = &f->zbd_info->zone_info[zone_idx_e]; for (z = zb; z < ze; z++) { + FLIST_HEAD(write_list); + pthread_mutex_lock(&z->mutex); pthread_mutex_lock(&f->zbd_info->mutex); f->zbd_info->sectors_with_data -= z->wp - z->start; pthread_mutex_unlock(&f->zbd_info->mutex); z->wp = z->start; - z->verify_block = 0; + z->seed++; + init_rand_seed(&z->rand_state, z->seed, td_use64(td)); + flist_splice_init(&z->write_list, &write_list); pthread_mutex_unlock(&z->mutex); + + while (!flist_empty(&write_list)) { + struct io_piece *ipo = flist_first_entry(&write_list, struct io_piece, list); + + /* Data "loss"... */ + flist_del(&ipo->list); + assert(ipo->flags & IP_F_ONLIST); + free(ipo); + } } td->ts.nr_zone_resets += ze - zb; @@ -1142,27 +1169,6 @@ out: return z; } -/* The caller must hold z->mutex. 
*/ -static struct fio_zone_info *zbd_replay_write_order(struct thread_data *td, - struct io_u *io_u, - struct fio_zone_info *z) -{ - const struct fio_file *f = io_u->file; - const uint32_t min_bs = td->o.min_bs[DDIR_WRITE]; - - if (!zbd_open_zone(td, io_u, z - f->zbd_info->zone_info)) { - pthread_mutex_unlock(&z->mutex); - z = zbd_convert_to_open_zone(td, io_u); - assert(z); - } - - if (z->verify_block * min_bs >= f->zbd_info->zone_size) - log_err("%s: %d * %d >= %llu\n", f->file_name, z->verify_block, - min_bs, (unsigned long long) f->zbd_info->zone_size); - io_u->offset = z->start + z->verify_block++ * min_bs; - return z; -} - /* * Find another zone for which @io_u fits below the write pointer. Start * searching in zones @zb + 1 .. @zl and continue searching in zones @@ -1454,10 +1460,6 @@ enum io_u_action zbd_adjust_block(struct thread_data *td, struct io_u *io_u) switch (io_u->ddir) { case DDIR_READ: - if (td->runstate == TD_VERIFYING) { - zb = zbd_replay_write_order(td, io_u, zb); - goto accept; - } /* * Check that there is enough written data in the zone to do an * I/O of at least min_bs B. 
If there isn't, find a new zone for @@ -1532,7 +1534,6 @@ enum io_u_action zbd_adjust_block(struct thread_data *td, struct io_u *io_u) } /* Reset the zone pointer if necessary */ if (zb->reset_zone || zbd_zone_full(f, zb, min_bs)) { - assert(td->o.verify == VERIFY_NONE); /* * Since previous write requests may have been submitted * asynchronously and since we will submit the zone diff --git a/zbd.h b/zbd.h index fb39fb82..013c08c9 100644 --- a/zbd.h +++ b/zbd.h @@ -23,23 +23,29 @@ enum io_u_action { * struct fio_zone_info - information about a single ZBD zone * @start: zone start location (bytes) * @wp: zone write pointer location (bytes) - * @verify_block: number of blocks that have been verified for this zone * @mutex: protects the modifiable members in this structure * @type: zone type (BLK_ZONE_TYPE_*) * @cond: zone state (BLK_ZONE_COND_*) * @open: whether or not this zone is currently open. Only relevant if * max_open_zones > 0. * @reset_zone: whether or not this zone should be reset before writing to it + * @seed: + * @rand_state: + * @write_list: + * @verify_mutex: */ struct fio_zone_info { pthread_mutex_t mutex; uint64_t start; uint64_t wp; - uint32_t verify_block; enum zbd_zone_type type:2; enum zbd_zone_cond cond:4; unsigned int open:1; unsigned int reset_zone:1; + uint64_t seed; + struct frand_state rand_state; + struct flist_head write_list; + pthread_mutex_t verify_mutex; }; /** @@ -89,6 +95,7 @@ enum fio_ddir zbd_adjust_ddir(struct thread_data *td, struct io_u *io_u, enum fio_ddir ddir); enum io_u_action zbd_adjust_block(struct thread_data *td, struct io_u *io_u); char *zbd_write_status(const struct thread_stat *ts); +uint32_t zbd_zone_idx(const struct fio_file *f, uint64_t offset); static inline void zbd_queue_io_u(struct io_u *io_u, enum fio_q_status status) { @@ -107,4 +114,13 @@ static inline void zbd_put_io_u(struct io_u *io_u) } } +/** + * zbd_zone_swr - Test whether a zone requires sequential writes + * @z: zone info pointer. 
+ */ +static inline bool zbd_zone_swr(struct fio_zone_info *z) +{ + return z->type == ZBD_ZONE_TYPE_SWR; +} + #endif /* FIO_ZBD_H */ -- 2.26.2