[PATCH 1/4] Add Zone Append command support for NVMe Zoned Namespaces (ZNS)

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



From: Ankit Kumar <ankit.kumar@xxxxxxxxxxx>

Add support for the Zone Append command defined in NVM Express TP4053. Added a new FIO option zone_append.
When zone_append option is enabled, the existing write path will
send Zone Append command with offset as start of the Zone.

Signed-off-by: Krishna Kanth Reddy <krish.reddy@xxxxxxxxxxx>
---
 HOWTO            |  7 +++++
 fio.1            |  7 +++++
 io_u.c           |  4 +--
 io_u.h           | 10 +++++--
 ioengines.c      |  4 +--
 options.c        | 10 +++++++
 thread_options.h |  2 ++
 zbd.c            | 90 ++++++++++++++++++++++++++++++++++++++++++++++++++++----
 zbd.h            | 13 +++++---
 9 files changed, 131 insertions(+), 16 deletions(-)

diff --git a/HOWTO b/HOWTO
index 8cf8d65..62b5ac8 100644
--- a/HOWTO
+++ b/HOWTO
@@ -1010,6 +1010,13 @@ Target file/device
 	:option:`zonesize` bytes of data have been transferred. This parameter
 	must be zero for :option:`zonemode` =zbd.
 
+.. option:: zone_append=bool
+
+	For :option:`zonemode` =zbd and for :option:`rw` =write or
+	:option:`rw` =randwrite, if zone_append is enabled, the io_u points to the
+	starting offset of a zone. On successful completion the multiple of
+	sectors relative to the zone's starting offset is returned.
+
 .. option:: read_beyond_wp=bool
 
 	This parameter applies to :option:`zonemode` =zbd only.
diff --git a/fio.1 b/fio.1
index f134e0b..09add8f 100644
--- a/fio.1
+++ b/fio.1
@@ -782,6 +782,13 @@ sequential workloads and ignored for random workloads. For read workloads,
 see also \fBread_beyond_wp\fR.
 
 .TP
+.BI zone_append \fR=\fPbool
+For \fBzonemode\fR =zbd and for \fBrw\fR=write or \fBrw\fR=randwrite, if
+zone_append is enabled, the io_u points to the starting offset of a zone. On
+successful completion the multiple of sectors relative to the zone's starting
+offset is returned.
+
+.TP
 .BI read_beyond_wp \fR=\fPbool
 This parameter applies to \fBzonemode=zbd\fR only.
 
diff --git a/io_u.c b/io_u.c
index 7f50906..b891a9b 100644
--- a/io_u.c
+++ b/io_u.c
@@ -778,7 +778,7 @@ void put_io_u(struct thread_data *td, struct io_u *io_u)
 {
 	const bool needs_lock = td_async_processing(td);
 
-	zbd_put_io_u(io_u);
+	zbd_put_io_u(td, io_u);
 
 	if (td->parent)
 		td = td->parent;
@@ -1342,7 +1342,7 @@ static long set_io_u_file(struct thread_data *td, struct io_u *io_u)
 		if (!fill_io_u(td, io_u))
 			break;
 
-		zbd_put_io_u(io_u);
+		zbd_put_io_u(td, io_u);
 
 		put_file_log(td, f);
 		td_io_close_file(td, f);
diff --git a/io_u.h b/io_u.h
index 87c2920..f5b24fd 100644
--- a/io_u.h
+++ b/io_u.h
@@ -94,19 +94,25 @@ struct io_u {
 	};
 
 	/*
+	 * for zone append this is the start offset of the zone.
+	 */
+	unsigned long long zone_start_offset;
+
+	/*
 	 * ZBD mode zbd_queue_io callback: called after engine->queue operation
 	 * to advance a zone write pointer and eventually unlock the I/O zone.
 	 * @q indicates the I/O queue status (busy, queued or completed).
 	 * @success == true means that the I/O operation has been queued or
 	 * completed successfully.
 	 */
-	void (*zbd_queue_io)(struct io_u *, int q, bool success);
+	void (*zbd_queue_io)(struct thread_data *, struct io_u *, int q,
+			     bool success);
 
 	/*
 	 * ZBD mode zbd_put_io callback: called in after completion of an I/O
 	 * or commit of an async I/O to unlock the I/O target zone.
 	 */
-	void (*zbd_put_io)(const struct io_u *);
+	void (*zbd_put_io)(struct thread_data *, const struct io_u *);
 
 	/*
 	 * Callback for io completion
diff --git a/ioengines.c b/ioengines.c
index 2c7a0df..81ac846 100644
--- a/ioengines.c
+++ b/ioengines.c
@@ -328,7 +328,7 @@ enum fio_q_status td_io_queue(struct thread_data *td, struct io_u *io_u)
 	}
 
 	ret = td->io_ops->queue(td, io_u);
-	zbd_queue_io_u(io_u, ret);
+	zbd_queue_io_u(td, io_u, ret);
 
 	unlock_file(td, io_u->file);
 
@@ -370,7 +370,7 @@ enum fio_q_status td_io_queue(struct thread_data *td, struct io_u *io_u)
 	if (!td->io_ops->commit) {
 		io_u_mark_submit(td, 1);
 		io_u_mark_complete(td, 1);
-		zbd_put_io_u(io_u);
+		zbd_put_io_u(td, io_u);
 	}
 
 	if (ret == FIO_Q_COMPLETED) {
diff --git a/options.c b/options.c
index 85a0f49..d54da81 100644
--- a/options.c
+++ b/options.c
@@ -3317,6 +3317,16 @@ struct fio_option fio_options[FIO_MAX_OPTS] = {
 		},
 	},
 	{
+		.name	= "zone_append",
+		.lname	= "zone_append",
+		.type	= FIO_OPT_BOOL,
+		.off1	= offsetof(struct thread_options, zone_append),
+		.help	= "Use Zone Append for Zone block device",
+		.def	= "0",
+		.category = FIO_OPT_C_IO,
+		.group	= FIO_OPT_G_ZONE,
+	},
+	{
 		.name	= "zonesize",
 		.lname	= "Zone size",
 		.type	= FIO_OPT_STR_VAL,
diff --git a/thread_options.h b/thread_options.h
index 968ea0a..45c5ef8 100644
--- a/thread_options.h
+++ b/thread_options.h
@@ -195,6 +195,7 @@ struct thread_options {
 	unsigned long long zone_size;
 	unsigned long long zone_skip;
 	enum fio_zone_mode zone_mode;
+	unsigned int zone_append;
 	unsigned long long lockmem;
 	enum fio_memtype mem_type;
 	unsigned int mem_align;
@@ -631,6 +632,7 @@ struct thread_options_pack {
 	uint32_t allow_mounted_write;
 
 	uint32_t zone_mode;
+	uint32_t zone_append;
 } __attribute__((packed));
 
 extern void convert_thread_options_to_cpu(struct thread_options *o, struct thread_options_pack *top);
diff --git a/zbd.c b/zbd.c
index 8cf8f81..ffdb766 100644
--- a/zbd.c
+++ b/zbd.c
@@ -455,6 +455,7 @@ static int parse_zone_info(struct thread_data *td, struct fio_file *f)
 		for (i = 0; i < nrz; i++, j++, z++, p++) {
 			mutex_init_pshared_with_type(&p->mutex,
 						     PTHREAD_MUTEX_RECURSIVE);
+			cond_init_pshared(&p->reset_cond);
 			p->start = z->start;
 			switch (z->cond) {
 			case ZBD_ZONE_COND_NOT_WP:
@@ -469,6 +470,7 @@ static int parse_zone_info(struct thread_data *td, struct fio_file *f)
 			}
 			p->type = z->type;
 			p->cond = z->cond;
+			p->pending_ios = 0;
 			if (j > 0 && p->start != p[-1].start + zone_size) {
 				log_info("%s: invalid zone data\n",
 					 f->file_name);
@@ -1196,20 +1198,24 @@ zbd_find_zone(struct thread_data *td, struct io_u *io_u,
 
 /**
  * zbd_queue_io - update the write pointer of a sequential zone
+ * @td: fio thread data.
  * @io_u: I/O unit
  * @success: Whether or not the I/O unit has been queued successfully
  * @q: queueing status (busy, completed or queued).
  *
  * For write and trim operations, update the write pointer of the I/O unit
  * target zone.
+ * For zone append operation, release the zone mutex
  */
-static void zbd_queue_io(struct io_u *io_u, int q, bool success)
+static void zbd_queue_io(struct thread_data *td, struct io_u *io_u, int q,
+			 bool success)
 {
 	const struct fio_file *f = io_u->file;
 	struct zoned_block_device_info *zbd_info = f->zbd_info;
 	struct fio_zone_info *z;
 	uint32_t zone_idx;
 	uint64_t zone_end;
+	int ret;
 
 	if (!zbd_info)
 		return;
@@ -1241,6 +1247,8 @@ static void zbd_queue_io(struct io_u *io_u, int q, bool success)
 			zbd_info->sectors_with_data += zone_end - z->wp;
 		pthread_mutex_unlock(&zbd_info->mutex);
 		z->wp = zone_end;
+		if (td->o.zone_append)
+			z->pending_ios++;
 		break;
 	case DDIR_TRIM:
 		assert(z->wp == z->start);
@@ -1250,18 +1258,22 @@ static void zbd_queue_io(struct io_u *io_u, int q, bool success)
 	}
 
 unlock:
-	if (!success || q != FIO_Q_QUEUED) {
+	if (!success || q != FIO_Q_QUEUED || td->o.zone_append) {
 		/* BUSY or COMPLETED: unlock the zone */
-		pthread_mutex_unlock(&z->mutex);
-		io_u->zbd_put_io = NULL;
+		ret = pthread_mutex_unlock(&z->mutex);
+		assert(ret == 0);
+		if (!success || q != FIO_Q_QUEUED)
+			io_u->zbd_put_io = NULL;
 	}
 }
 
 /**
  * zbd_put_io - Unlock an I/O unit target zone lock
+ * For the zone append operation we don't hold the zone lock
+ * @td: fio thread data.
  * @io_u: I/O unit
  */
-static void zbd_put_io(const struct io_u *io_u)
+static void zbd_put_io(struct thread_data *td, const struct io_u *io_u)
 {
 	const struct fio_file *f = io_u->file;
 	struct zoned_block_device_info *zbd_info = f->zbd_info;
@@ -1283,6 +1295,19 @@ static void zbd_put_io(const struct io_u *io_u)
 	       "%s: terminate I/O (%lld, %llu) for zone %u\n",
 	       f->file_name, io_u->offset, io_u->buflen, zone_idx);
 
+	if (td->o.zone_append) {
+		pthread_mutex_lock(&z->mutex);
+		if (z->pending_ios > 0) {
+			z->pending_ios--;
+			/*
+			 * Other threads may be waiting for pending I/O's to
+			 * complete for this zone. Notify them.
+			 */
+			if (!z->pending_ios)
+				pthread_cond_broadcast(&z->reset_cond);
+		}
+	}
+
 	ret = pthread_mutex_unlock(&z->mutex);
 	assert(ret == 0);
 	zbd_check_swd(f);
@@ -1524,16 +1549,69 @@ enum io_u_action zbd_adjust_block(struct thread_data *td, struct io_u *io_u)
 			 * asynchronously and since we will submit the zone
 			 * reset synchronously, wait until previously submitted
 			 * write requests have completed before issuing a
-			 * zone reset.
+			 * zone reset. For append request release the zone lock
+			 * as other threads will acquire it at the time of
+			 * zbd_put_io.
 			 */
+reset:
+			if (td->o.zone_append)
+				pthread_mutex_unlock(&zb->mutex);
 			io_u_quiesce(td);
+			if (td->o.zone_append)
+				pthread_mutex_lock(&zb->mutex);
+
 			zb->reset_zone = 0;
+			if (td->o.zone_append) {
+				/*
+				 * While processing the current thread queued
+				 * requests the other thread may have already
+				 * done zone reset so need to check zone full
+				 * condition again.
+				 */
+				if (!zbd_zone_full(f, zb, min_bs))
+					goto proceed;
+				/*
+				 * Wait for the pending requests to be completed
+				 * else we are ok to reset this zone.
+				 */
+				if (zb->pending_ios) {
+					pthread_cond_wait(&zb->reset_cond, &zb->mutex);
+					goto proceed;
+				}
+			}
+
 			if (zbd_reset_zone(td, f, zb) < 0)
 				goto eof;
+
+			/* Notify other threads waiting for zone mutex */
+			if (td->o.zone_append)
+				pthread_cond_broadcast(&zb->reset_cond);
+		}
+proceed:
+		/*
+		 * Check for zone full condition again. For zone append request
+		 * the zone may already be reset, written and full while we
+		 * were waiting for our turn.
+		 */
+		if (zbd_zone_full(f, zb, min_bs)) {
+			goto reset;
 		}
+
 		/* Make writes occur at the write pointer */
 		assert(!zbd_zone_full(f, zb, min_bs));
 		io_u->offset = zb->wp;
+
+		/*
+		 * Support zone append for both regular and zoned block
+		 * device.
+		 */
+		if (td->o.zone_append) {
+			if (f->zbd_info->model == ZBD_NONE)
+				io_u->zone_start_offset = zb->wp;
+			else
+				io_u->zone_start_offset = zb->start;
+		}
+
 		if (!is_valid_offset(f, io_u->offset)) {
 			dprint(FD_ZBD, "Dropped request with offset %llu\n",
 			       io_u->offset);
diff --git a/zbd.h b/zbd.h
index e942a7f..eac42f7 100644
--- a/zbd.h
+++ b/zbd.h
@@ -23,8 +23,10 @@ enum io_u_action {
  * struct fio_zone_info - information about a single ZBD zone
  * @start: zone start location (bytes)
  * @wp: zone write pointer location (bytes)
+ * @pending_ios: number of I/Os pending in this zone
  * @verify_block: number of blocks that have been verified for this zone
  * @mutex: protects the modifiable members in this structure
+ * @reset_cond: zone reset check condition. only relevant for zone_append.
  * @type: zone type (BLK_ZONE_TYPE_*)
  * @cond: zone state (BLK_ZONE_COND_*)
  * @open: whether or not this zone is currently open. Only relevant if
@@ -33,8 +35,10 @@ enum io_u_action {
  */
 struct fio_zone_info {
 	pthread_mutex_t		mutex;
+	pthread_cond_t		reset_cond;
 	uint64_t		start;
 	uint64_t		wp;
+	uint32_t		pending_ios;
 	uint32_t		verify_block;
 	enum zbd_zone_type	type:2;
 	enum zbd_zone_cond	cond:4;
@@ -96,18 +100,19 @@ static inline void zbd_close_file(struct fio_file *f)
 		zbd_free_zone_info(f);
 }
 
-static inline void zbd_queue_io_u(struct io_u *io_u, enum fio_q_status status)
+static inline void zbd_queue_io_u(struct thread_data *td, struct io_u *io_u,
+				  enum fio_q_status status)
 {
 	if (io_u->zbd_queue_io) {
-		io_u->zbd_queue_io(io_u, status, io_u->error == 0);
+		io_u->zbd_queue_io(td, io_u, status, io_u->error == 0);
 		io_u->zbd_queue_io = NULL;
 	}
 }
 
-static inline void zbd_put_io_u(struct io_u *io_u)
+static inline void zbd_put_io_u(struct thread_data *td, struct io_u *io_u)
 {
 	if (io_u->zbd_put_io) {
-		io_u->zbd_put_io(io_u);
+		io_u->zbd_put_io(td, io_u);
 		io_u->zbd_queue_io = NULL;
 		io_u->zbd_put_io = NULL;
 	}
-- 
2.7.4




[Index of Archives]     [Linux Kernel]     [Linux SCSI]     [Linux IDE]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux SCSI]

  Powered by Linux