Alexey,

Adding Keith to this thread.

On 2020/03/27 5:44, Alexey Dobriyan wrote:
> Add simple iodepth=1 NVMe engine:
> 
>         ioengine=nvme
> 
> It works via standard Linux NVMe ioctls.

Keith is working on splitting nvme-cli into the CLI part and libnvme, which
uses the kernel ioctl interface for NVMe command passthrough. So I think it
may be better to implement ioengine=libnvme using Keith's libnvme library.
That would remove the need to define all the NVMe command structures here.

> 
> It will be used for testing upcoming ZNS stuff.

libnvme will have this support too. But you will also need the ioengine to be
able to plug into zonemode=zbd, to avoid a lot of headaches with unaligned
write errors under various workloads. I have a series almost ready to go out
(in testing right now) that does just that for a new libzbc IO engine. That
IO engine does passthrough to SMR drives, for the exact same use case,
namely, testing drives on kernels that do not have zoned block device support
(e.g. a lot of customers in the field use old-ish enterprise distros with 3.x
kernels where zoned block devices are not supported).

> 
> Currently Linux doesn't recognize NVMe ZNS devices as zoned block
> devices so zone ioctls (BLKRESETZONE et al) can't be used.

Patches for that are ready to go out as soon as the ZNS TP is approved :)
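Once those patches land, zone management on a ZNS namespace will go through
the same generic ioctls from <linux/blkzoned.h> that SMR uses today. For
illustration, a zone reset is just this (minimal sketch, assuming fd is open
on a device the kernel exposes as zoned):

        #include <linux/blkzoned.h>
        #include <sys/ioctl.h>

        /*
         * Reset one zone. sector/nr_sectors are in 512B sector units.
         * On a device the kernel does not expose as zoned (NVMe ZNS
         * today), this fails with ENOTTY, which is the limitation
         * described above.
         */
        static int reset_zone(int fd, __u64 sector, __u64 nr_sectors)
        {
                struct blk_zone_range range = {
                        .sector = sector,
                        .nr_sectors = nr_sectors,
                };

                return ioctl(fd, BLKRESETZONE, &range);
        }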
> 
> Passthrough ioctls should allow Zone Append and whatever commands
> new specs bring.

Yes, but we will need a new rw= control for that one, as implementing verify
for it will not be trivial (the relation between the offset a write is issued
at and where the data ends up on the device is lost with zone append).

> 
> Support read, write, fsync, fdatasync.
> Don't support sync_file_range obviously.
> Don't support trim for now, until I figure out all qemu options and
> the story behind broken qemu trim support.

Using Keith's libnvme can probably simplify support for all of this.
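For instance, the Identify Namespace setup in nvme_open_file() below should
collapse to a single helper call. A rough sketch only -- the library is not
released yet, so the <libnvme.h> header name and the nvme_identify_ns()
signature here are assumptions based on the current nvme-cli code, not a
published API:

        #include <libnvme.h>            /* assumed header name */
        #include <linux/nvme_ioctl.h>
        #include <sys/ioctl.h>
        #include <fcntl.h>
        #include <stdio.h>

        int main(void)
        {
                struct nvme_id_ns id;   /* would come from libnvme */
                int fd, nsid;

                fd = open("/dev/nvme0n1", O_RDONLY);    /* example device */
                if (fd < 0)
                        return 1;

                nsid = ioctl(fd, NVME_IOCTL_ID);
                if (nsid <= 0)
                        return 1;

                /*
                 * One library call instead of the hand-rolled
                 * nvme_admin_cmd setup and the local struct nvme_id_ns /
                 * struct nvme_lbaf definitions in the patch.
                 */
                if (nvme_identify_ns(fd, nsid, &id) == 0)
                        printf("LBA size: %u bytes\n",
                               1U << id.lbaf[id.flbas & 15].ds);

                return 0;
        }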
> 
> Signed-off-by: Alexey Dobriyan (SK hynix) <adobriyan@xxxxxxxxx>
> ---
> 
>  Makefile       |    3 
>  configure      |   20 +++++
>  engines/nvme.c |  226 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  file.h         |    1 
>  4 files changed, 250 insertions(+)
> 
> --- a/Makefile
> +++ b/Makefile
> @@ -163,6 +163,9 @@ endif
>  ifdef CONFIG_LINUX_BLKZONED
>    SOURCE += zbd.c
>  endif
> +ifdef CONFIG_NVME
> +  SOURCE += engines/nvme.c
> +endif
>  
>  ifeq ($(CONFIG_TARGET_OS), Linux)
>    SOURCE += diskutil.c fifo.c blktrace.c cgroup.c trim.c engines/sg.c \
> --- a/configure
> +++ b/configure
> @@ -2397,6 +2397,22 @@ if compile_prog "" "" "linux_blkzoned"; then
>  fi
>  print_config "Zoned block device support" "$linux_blkzoned"
>  
> +##########################################
> +if test "$linux_nvme" != "yes" ; then
> +  linux_nvme="no"
> +fi
> +cat >$TMPC <<EOF
> +#include <linux/nvme_ioctl.h>
> +int main(void)
> +{
> +        return 0;
> +}
> +EOF
> +if compile_prog "" "" "linux_nvme"; then
> +  linux_nvme="yes"
> +fi
> +print_config "NVMe engine" "$linux_nvme"
> +
>  ##########################################
>  # check march=armv8-a+crc+crypto
>  if test "$march_armv8_a_crc_crypto" != "yes" ; then
> @@ -2912,6 +2928,10 @@ if test "$libnbd" = "yes" ; then
>    echo "LIBNBD_CFLAGS=$libnbd_cflags" >> $config_host_mak
>    echo "LIBNBD_LIBS=$libnbd_libs" >> $config_host_mak
>  fi
> +if test "$linux_nvme" = "yes" ; then
> +  output_sym "CONFIG_NVME"
> +fi
> +
>  cat > $TMPC << EOF
>  int main(int argc, char **argv)
>  {
> new file mode 100644
> --- /dev/null
> +++ b/engines/nvme.c
> @@ -0,0 +1,226 @@
> +/* NVMe passthrough engine.
> + */
> +#include <linux/nvme_ioctl.h>
> +
> +#include <sys/types.h>
> +#include <sys/stat.h>
> +#include <fcntl.h>
> +#include <sys/ioctl.h>
> +
> +#include "../fio.h"
> +
> +enum {
> +        nvme_admin_identify = 6,
> +};
> +
> +enum {
> +        nvme_cmd_flush = 0,
> +        nvme_cmd_write = 1,
> +        nvme_cmd_read  = 2,
> +};
> +
> +struct nvme_lbaf {
> +        __le16  ms;
> +        __u8    ds;
> +        __u8    rp;
> +};
> +
> +struct nvme_id_ns {
> +        __le64  nsze;
> +        __le64  ncap;
> +        __le64  nuse;
> +        __u8    nsfeat;
> +        __u8    nlbaf;
> +        __u8    flbas;
> +        __u8    mc;
> +        __u8    dpc;
> +        __u8    dps;
> +        __u8    nmic;
> +        __u8    rescap;
> +        __u8    fpi;
> +        __u8    dlfeat;
> +        __le16  nawun;
> +        __le16  nawupf;
> +        __le16  nacwu;
> +        __le16  nabsn;
> +        __le16  nabo;
> +        __le16  nabspf;
> +        __le16  noiob;
> +        __u8    nvmcap[16];
> +        __le16  npwg;
> +        __le16  npwa;
> +        __le16  npdg;
> +        __le16  npda;
> +        __le16  nows;
> +        __u8    rsvd74[18];
> +        __le32  anagrpid;
> +        __u8    rsvd96[3];
> +        __u8    nsattr;
> +        __le16  nvmsetid;
> +        __le16  endgid;
> +        __u8    nguid[16];
> +        __u8    eui64[8];
> +        struct nvme_lbaf lbaf[16];
> +        __u8    rsvd192[192];
> +        __u8    vs[3712];
> +};
> +
> +static inline uint32_t get_nsid(const struct fio_file *f)
> +{
> +        return (uintptr_t)f->engine_data;
> +}
> +
> +static int nvme_open_file(struct thread_data *td, struct fio_file *f)
> +{
> +        struct nvme_admin_cmd cmd;
> +        struct nvme_id_ns id;
> +        struct stat st;
> +        uint32_t nsid;
> +
> +        /* NVMe ioctls ignore open flags, require CAP_SYS_ADMIN only. */
> +        f->fd = open(f->file_name, O_RDONLY);
> +        if (f->fd < 0) {
> +                return -errno;
> +        }
> +        if (fstat(f->fd, &st) == -1) {
> +                return -errno;
> +        }
> +        if (!S_ISBLK(st.st_mode)) {
> +                log_err("%s: nvme engine requires NVMe block device\n",
> +                        f->file_name);
> +                return 1;
> +        }
> +
> +        nsid = ioctl(f->fd, NVME_IOCTL_ID);
> +        if (nsid < 1) {
> +                log_err("%s: ioctl NVME_IOCTL_ID\n", f->file_name);
> +                return 1;
> +        }
> +
> +        f->engine_data = (void *)(uintptr_t)nsid;
> +
> +        /* Identify Namespace */
> +        memset(&cmd, 0, sizeof(struct nvme_admin_cmd));
> +        cmd.opcode = nvme_admin_identify;
> +        cmd.nsid = nsid;
> +        cmd.addr = (uintptr_t)&id;
> +        cmd.data_len = 4096;
> +        if (ioctl(f->fd, NVME_IOCTL_ADMIN_CMD, &cmd) != 0) {
> +                log_err("%s: ioctl NVME_IOCTL_ADMIN_CMD\n", f->file_name);
> +                return 1;
> +        }
> +
> +        f->lba_shift = id.lbaf[id.flbas & 15].ds;
> +        return 0;
> +}
> +
> +static int fio_nvme_read(struct fio_file *f, struct io_u *io_u)
> +{
> +        fio_unused uint32_t nsid = get_nsid(f);
> +        struct nvme_user_io cmd = {};
> +
> +        //printf("R %u %llu/%llu\n", nsid, io_u->offset, io_u->xfer_buflen);
> +
> +        assert((io_u->xfer_buflen & ((1ULL << f->lba_shift) - 1)) == 0);
> +        assert((io_u->offset & ((1ULL << f->lba_shift) - 1)) == 0);
> +
> +        cmd.opcode = nvme_cmd_read;
> +        cmd.nblocks = (io_u->xfer_buflen >> f->lba_shift) - 1;
> +        cmd.addr = (uintptr_t)io_u->xfer_buf;
> +        cmd.slba = io_u->offset >> f->lba_shift;
> +        return ioctl(f->fd, NVME_IOCTL_SUBMIT_IO, &cmd);
> +}
> +
> +static int fio_nvme_write(struct fio_file *f, struct io_u *io_u)
> +{
> +        fio_unused uint32_t nsid = get_nsid(f);
> +        struct nvme_user_io cmd = {};
> +
> +        //printf("W %u %llu/%llu\n", nsid, io_u->offset, io_u->xfer_buflen);
> +
> +        assert((io_u->xfer_buflen & ((1ULL << f->lba_shift) - 1)) == 0);
> +        assert((io_u->offset & ((1ULL << f->lba_shift) - 1)) == 0);
> +
> +        cmd.opcode = nvme_cmd_write;
> +        cmd.nblocks = (io_u->xfer_buflen >> f->lba_shift) - 1;
> +        cmd.addr = (uintptr_t)io_u->xfer_buf;
> +        cmd.slba = io_u->offset >> f->lba_shift;
> +        return ioctl(f->fd, NVME_IOCTL_SUBMIT_IO, &cmd);
> +}
> +
> +static int fio_nvme_flush(struct fio_file *f)
> +{
> +        uint32_t nsid = get_nsid(f);
> +        struct nvme_passthru_cmd cmd = {};
> +
> +        //printf("F %u\n", nsid);
> +
> +        cmd.opcode = nvme_cmd_flush;
> +        cmd.nsid = nsid;
> +        return ioctl(f->fd, NVME_IOCTL_IO_CMD, &cmd);
> +}
> +
> +static enum fio_q_status fio_nvme_queue(struct thread_data *td, struct io_u *io_u)
> +{
> +        struct fio_file *f = io_u->file;
> +        int rv;
> +
> +        fio_ro_check(td, io_u);
> +
> +        if (io_u->ddir == DDIR_READ) {
> +                // FIXME MDTS
> +                rv = fio_nvme_read(f, io_u);
> +                if (rv == 0) {
> +                        io_u->resid = 0;
> +                        io_u->error = 0;
> +                } else {
> +                        io_u->error = rv;
> +                }
> +        } else if (io_u->ddir == DDIR_WRITE) {
> +                // FIXME MDTS
> +                rv = fio_nvme_write(f, io_u);
> +                if (rv == 0) {
> +                        io_u->resid = 0;
> +                        io_u->error = 0;
> +                } else {
> +                        io_u->error = rv;
> +                }
> +        } else if (io_u->ddir == DDIR_TRIM) {
> +                // FIXME
> +                rv = io_u->xfer_buflen;
> +                io_u->error = EINVAL;
> +        } else if (io_u->ddir == DDIR_SYNC || io_u->ddir == DDIR_DATASYNC) {
> +                rv = fio_nvme_flush(f);
> +        } else {
> +                rv = io_u->xfer_buflen;
> +                io_u->error = EINVAL;
> +        }
> +
> +        if (io_u->error) {
> +                io_u_log_error(td, io_u);
> +                td_verror(td, io_u->error, "xfer");
> +        }
> +
> +        return FIO_Q_COMPLETED;
> +}
> +
> +static struct ioengine_ops ioengine = {
> +        .name           = "nvme",
> +        .version        = FIO_IOOPS_VERSION,
> +        .flags          = FIO_SYNCIO|FIO_RAWIO|FIO_NOEXTEND,
> +        .queue          = fio_nvme_queue,
> +        .open_file      = nvme_open_file,
> +        .close_file     = generic_close_file,
> +        .get_file_size  = generic_get_file_size,
> +};
> +
> +fio_init
> +static void register_nvme_ioengine(void)
> +{
> +        register_ioengine(&ioengine);
> +}
> +
> +fio_exit
> +static void unregister_nvme_ioengine(void)
> +{
> +        unregister_ioengine(&ioengine);
> +}
> --- a/file.h
> +++ b/file.h
> @@ -99,6 +99,7 @@ struct fio_file {
>          uint64_t real_file_size;
>          uint64_t file_offset;
>          uint64_t io_size;
> +        unsigned int lba_shift;
>  
>          /*
>           * Zoned block device information. See also zonemode=zbd.

-- 
Damien Le Moal
Western Digital Research