From: Ankit Kumar <ankit.kumar@xxxxxxxxxxx> defined in NVM Express TP4053. Added a new FIO option zone_append. When zone_append option is enabled, the existing write path will send Zone Append command with offset as start of the Zone. Signed-off-by: Krishna Kanth Reddy <krish.reddy@xxxxxxxxxxx> --- HOWTO | 7 +++++ fio.1 | 7 +++++ io_u.c | 4 +-- io_u.h | 10 +++++-- ioengines.c | 4 +-- options.c | 10 +++++++ thread_options.h | 2 ++ zbd.c | 90 ++++++++++++++++++++++++++++++++++++++++++++++++++++---- zbd.h | 13 +++++--- 9 files changed, 131 insertions(+), 16 deletions(-) diff --git a/HOWTO b/HOWTO index 8cf8d65..62b5ac8 100644 --- a/HOWTO +++ b/HOWTO @@ -1010,6 +1010,13 @@ Target file/device :option:`zonesize` bytes of data have been transferred. This parameter must be zero for :option:`zonemode` =zbd. +.. option:: zone_append=bool + + For :option:`zonemode` =zbd and for :option:`rw` =write or :option: + `rw` =randwrite, if zone_append is enabled, the the io_u points to the + starting offset of a zone. On successful completion the multiple of + sectors relative to the zone's starting offset is returned. + .. option:: read_beyond_wp=bool This parameter applies to :option:`zonemode` =zbd only. diff --git a/fio.1 b/fio.1 index f134e0b..09add8f 100644 --- a/fio.1 +++ b/fio.1 @@ -782,6 +782,13 @@ sequential workloads and ignored for random workloads. For read workloads, see also \fBread_beyond_wp\fR. .TP +.BI zone_append +For \fBzonemode\fR =zbd and for \fBrw\fR=write or \fBrw\fR=randwrite, if +zone_append is enabled, the io_u points to the starting offset of a zone. On +successful completion the multiple of sectors relative to the zone's starting +offset is returned. + +.TP .BI read_beyond_wp \fR=\fPbool This parameter applies to \fBzonemode=zbd\fR only. diff --git a/io_u.c b/io_u.c index 7f50906..b891a9b 100644 --- a/io_u.c +++ b/io_u.c @@ -778,7 +778,7 @@ void put_io_u(struct thread_data *td, struct io_u *io_u) { const bool needs_lock = td_async_processing(td); - zbd_put_io_u(io_u); + zbd_put_io_u(td, io_u); if (td->parent) td = td->parent; @@ -1342,7 +1342,7 @@ static long set_io_u_file(struct thread_data *td, struct io_u *io_u) if (!fill_io_u(td, io_u)) break; - zbd_put_io_u(io_u); + zbd_put_io_u(td, io_u); put_file_log(td, f); td_io_close_file(td, f); diff --git a/io_u.h b/io_u.h index 87c2920..f5b24fd 100644 --- a/io_u.h +++ b/io_u.h @@ -94,19 +94,25 @@ struct io_u { }; /* + * for zone append this is the start offset of the zone. + */ + unsigned long long zone_start_offset; + + /* * ZBD mode zbd_queue_io callback: called after engine->queue operation * to advance a zone write pointer and eventually unlock the I/O zone. * @q indicates the I/O queue status (busy, queued or completed). * @success == true means that the I/O operation has been queued or * completed successfully. */ - void (*zbd_queue_io)(struct io_u *, int q, bool success); + void (*zbd_queue_io)(struct thread_data *, struct io_u *, int q, + bool success); /* * ZBD mode zbd_put_io callback: called in after completion of an I/O * or commit of an async I/O to unlock the I/O target zone. */ - void (*zbd_put_io)(const struct io_u *); + void (*zbd_put_io)(struct thread_data *, const struct io_u *); /* * Callback for io completion diff --git a/ioengines.c b/ioengines.c index 2c7a0df..81ac846 100644 --- a/ioengines.c +++ b/ioengines.c @@ -328,7 +328,7 @@ enum fio_q_status td_io_queue(struct thread_data *td, struct io_u *io_u) } ret = td->io_ops->queue(td, io_u); - zbd_queue_io_u(io_u, ret); + zbd_queue_io_u(td, io_u, ret); unlock_file(td, io_u->file); @@ -370,7 +370,7 @@ enum fio_q_status td_io_queue(struct thread_data *td, struct io_u *io_u) if (!td->io_ops->commit) { io_u_mark_submit(td, 1); io_u_mark_complete(td, 1); - zbd_put_io_u(io_u); + zbd_put_io_u(td, io_u); } if (ret == FIO_Q_COMPLETED) { diff --git a/options.c b/options.c index 85a0f49..d54da81 100644 --- a/options.c +++ b/options.c @@ -3317,6 +3317,16 @@ struct fio_option fio_options[FIO_MAX_OPTS] = { }, }, { + .name = "zone_append", + .lname = "zone_append", + .type = FIO_OPT_BOOL, + .off1 = offsetof(struct thread_options, zone_append), + .help = "Use Zone Append for Zone block device", + .def = "0", + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_ZONE, + }, + { .name = "zonesize", .lname = "Zone size", .type = FIO_OPT_STR_VAL, diff --git a/thread_options.h b/thread_options.h index 968ea0a..45c5ef8 100644 --- a/thread_options.h +++ b/thread_options.h @@ -195,6 +195,7 @@ struct thread_options { unsigned long long zone_size; unsigned long long zone_skip; enum fio_zone_mode zone_mode; + unsigned int zone_append; unsigned long long lockmem; enum fio_memtype mem_type; unsigned int mem_align; @@ -631,6 +632,7 @@ struct thread_options_pack { uint32_t allow_mounted_write; uint32_t zone_mode; + uint32_t zone_append; } __attribute__((packed)); extern void convert_thread_options_to_cpu(struct thread_options *o, struct thread_options_pack *top); diff --git a/zbd.c b/zbd.c index 8cf8f81..ffdb766 100644 --- a/zbd.c +++ b/zbd.c @@ -455,6 +455,7 @@ static int parse_zone_info(struct thread_data *td, struct fio_file *f) for (i = 0; i < nrz; i++, j++, z++, p++) { mutex_init_pshared_with_type(&p->mutex, PTHREAD_MUTEX_RECURSIVE); + cond_init_pshared(&p->reset_cond); p->start = z->start; switch (z->cond) { case ZBD_ZONE_COND_NOT_WP: @@ -469,6 +470,7 @@ static int parse_zone_info(struct thread_data *td, struct fio_file *f) } p->type = z->type; p->cond = z->cond; + p->pending_ios = 0; if (j > 0 && p->start != p[-1].start + zone_size) { log_info("%s: invalid zone data\n", f->file_name); @@ -1196,20 +1198,24 @@ zbd_find_zone(struct thread_data *td, struct io_u *io_u, /** * zbd_queue_io - update the write pointer of a sequential zone + * @td: fio thread data. * @io_u: I/O unit * @success: Whether or not the I/O unit has been queued successfully * @q: queueing status (busy, completed or queued). * * For write and trim operations, update the write pointer of the I/O unit * target zone. + * For zone append operation, release the zone mutex */ -static void zbd_queue_io(struct io_u *io_u, int q, bool success) +static void zbd_queue_io(struct thread_data *td, struct io_u *io_u, int q, + bool success) { const struct fio_file *f = io_u->file; struct zoned_block_device_info *zbd_info = f->zbd_info; struct fio_zone_info *z; uint32_t zone_idx; uint64_t zone_end; + int ret; if (!zbd_info) return; @@ -1241,6 +1247,8 @@ static void zbd_queue_io(struct io_u *io_u, int q, bool success) zbd_info->sectors_with_data += zone_end - z->wp; pthread_mutex_unlock(&zbd_info->mutex); z->wp = zone_end; + if (td->o.zone_append) + z->pending_ios++; break; case DDIR_TRIM: assert(z->wp == z->start); @@ -1250,18 +1258,22 @@ static void zbd_queue_io(struct io_u *io_u, int q, bool success) } unlock: - if (!success || q != FIO_Q_QUEUED) { + if (!success || q != FIO_Q_QUEUED || td->o.zone_append) { /* BUSY or COMPLETED: unlock the zone */ - pthread_mutex_unlock(&z->mutex); - io_u->zbd_put_io = NULL; + ret = pthread_mutex_unlock(&z->mutex); + assert(ret == 0); + if (!success || q != FIO_Q_QUEUED) + io_u->zbd_put_io = NULL; } } /** * zbd_put_io - Unlock an I/O unit target zone lock + * For zone append operation we don't hold zone lock + * @td: fio thread data. * @io_u: I/O unit */ -static void zbd_put_io(const struct io_u *io_u) +static void zbd_put_io(struct thread_data *td, const struct io_u *io_u) { const struct fio_file *f = io_u->file; struct zoned_block_device_info *zbd_info = f->zbd_info; @@ -1283,6 +1295,19 @@ static void zbd_put_io(const struct io_u *io_u) "%s: terminate I/O (%lld, %llu) for zone %u\n", f->file_name, io_u->offset, io_u->buflen, zone_idx); + if (td->o.zone_append) { + pthread_mutex_lock(&z->mutex); + if (z->pending_ios > 0) { + z->pending_ios--; + /* + * Other threads may be waiting for pending I/O's to + * complete for this zone. Notify them. + */ + if (!z->pending_ios) + pthread_cond_broadcast(&z->reset_cond); + } + } + ret = pthread_mutex_unlock(&z->mutex); assert(ret == 0); zbd_check_swd(f); @@ -1524,16 +1549,69 @@ enum io_u_action zbd_adjust_block(struct thread_data *td, struct io_u *io_u) * asynchronously and since we will submit the zone * reset synchronously, wait until previously submitted * write requests have completed before issuing a - * zone reset. + * zone reset. For append request release the zone lock + * as other threads will acquire it at the time of + * zbd_put_io. */ +reset: + if (td->o.zone_append) + pthread_mutex_unlock(&zb->mutex); io_u_quiesce(td); + if (td->o.zone_append) + pthread_mutex_lock(&zb->mutex); + zb->reset_zone = 0; + if (td->o.zone_append) { + /* + * While processing the current thread queued + * requests the other thread may have already + * done zone reset so need to check zone full + * condition again. + */ + if (!zbd_zone_full(f, zb, min_bs)) + goto proceed; + /* + * Wait for the pending requests to be completed + * else we are ok to reset this zone. + */ + if (zb->pending_ios) { + pthread_cond_wait(&zb->reset_cond, &zb->mutex); + goto proceed; + } + } + if (zbd_reset_zone(td, f, zb) < 0) goto eof; + + /* Notify other threads waiting for zone mutex */ + if (td->o.zone_append) + pthread_cond_broadcast(&zb->reset_cond); + } +proceed: + /* + * Check for zone full condition again. For zone append request + * the zone may already be reset, written and full while we + * were waiting for our turn. + */ + if (zbd_zone_full(f, zb, min_bs)) { + goto reset; } + /* Make writes occur at the write pointer */ assert(!zbd_zone_full(f, zb, min_bs)); io_u->offset = zb->wp; + + /* + * Support zone append for both regular and zoned block + * device. + */ + if (td->o.zone_append) { + if (f->zbd_info->model == ZBD_NONE) + io_u->zone_start_offset = zb->wp; + else + io_u->zone_start_offset = zb->start; + } + if (!is_valid_offset(f, io_u->offset)) { dprint(FD_ZBD, "Dropped request with offset %llu\n", io_u->offset); diff --git a/zbd.h b/zbd.h index e942a7f..eac42f7 100644 --- a/zbd.h +++ b/zbd.h @@ -23,8 +23,10 @@ enum io_u_action { * struct fio_zone_info - information about a single ZBD zone * @start: zone start location (bytes) * @wp: zone write pointer location (bytes) + * @pending_ios: Number of IO's pending in this zone * @verify_block: number of blocks that have been verified for this zone * @mutex: protects the modifiable members in this structure + * @reset_cond: zone reset check condition. only relevant for zone_append. * @type: zone type (BLK_ZONE_TYPE_*) * @cond: zone state (BLK_ZONE_COND_*) * @open: whether or not this zone is currently open. Only relevant if @@ -33,8 +35,10 @@ enum io_u_action { */ struct fio_zone_info { pthread_mutex_t mutex; + pthread_cond_t reset_cond; uint64_t start; uint64_t wp; + uint32_t pending_ios; uint32_t verify_block; enum zbd_zone_type type:2; enum zbd_zone_cond cond:4; @@ -96,18 +100,19 @@ static inline void zbd_close_file(struct fio_file *f) zbd_free_zone_info(f); } -static inline void zbd_queue_io_u(struct io_u *io_u, enum fio_q_status status) +static inline void zbd_queue_io_u(struct thread_data *td, struct io_u *io_u, + enum fio_q_status status) { if (io_u->zbd_queue_io) { - io_u->zbd_queue_io(io_u, status, io_u->error == 0); + io_u->zbd_queue_io(td, io_u, status, io_u->error == 0); io_u->zbd_queue_io = NULL; } } -static inline void zbd_put_io_u(struct io_u *io_u) +static inline void zbd_put_io_u(struct thread_data *td, struct io_u *io_u) { if (io_u->zbd_put_io) { - io_u->zbd_put_io(io_u); + io_u->zbd_put_io(td, io_u); io_u->zbd_queue_io = NULL; io_u->zbd_put_io = NULL; } -- 2.7.4