Use the previously introduced NVME wrapper library for the passthrough commands from the ANA prioritizer. Discard code duplicated from nvme-cli from the ana code itself. Furthermore, make additional cleanups in the ANA prioritizer: - don't use the same enum for priorities and error codes - use char* arrays for error messages and state names - return -1 prio to libmultipath for all error cases - check for overflow in get_ana_state() - get_ana_info(): improve readability with is_anagrpid_const - priorities: PERSISTENT_LOSS state is worse than INACCESSIBLE and CHANGE Cc: lijie <lijie34@xxxxxxxxxx> Signed-off-by: Martin Wilck <mwilck@xxxxxxxx> --- libmultipath/prioritizers/Makefile | 6 +- libmultipath/prioritizers/ana.c | 334 ++++++++++++----------------- libmultipath/prioritizers/ana.h | 221 ------------------- 3 files changed, 143 insertions(+), 418 deletions(-) delete mode 100644 libmultipath/prioritizers/ana.h diff --git a/libmultipath/prioritizers/Makefile b/libmultipath/prioritizers/Makefile index 15afaba3..4d80c20c 100644 --- a/libmultipath/prioritizers/Makefile +++ b/libmultipath/prioritizers/Makefile @@ -19,9 +19,13 @@ LIBS = \ libpriordac.so \ libprioweightedpath.so \ libpriopath_latency.so \ - libprioana.so \ libpriosysfs.so +ifneq ($(call check_file,/usr/include/linux/nvme_ioctl.h),0) + LIBS += libprioana.so + CFLAGS += -I../nvme +endif + all: $(LIBS) libprioalua.so: alua.o alua_rtpg.o diff --git a/libmultipath/prioritizers/ana.c b/libmultipath/prioritizers/ana.c index c5aaa5fb..6000d548 100644 --- a/libmultipath/prioritizers/ana.c +++ b/libmultipath/prioritizers/ana.c @@ -17,155 +17,82 @@ #include <sys/stat.h> #include <sys/types.h> #include <stdbool.h> +#include <libudev.h> #include "debug.h" +#include "nvme-lib.h" #include "prio.h" +#include "util.h" #include "structs.h" -#include "ana.h" enum { - ANA_PRIO_OPTIMIZED = 50, - ANA_PRIO_NONOPTIMIZED = 10, - ANA_PRIO_INACCESSIBLE = 5, - ANA_PRIO_PERSISTENT_LOSS = 1, - ANA_PRIO_CHANGE = 0, - 
ANA_PRIO_RESERVED = 0, - ANA_PRIO_GETCTRL_FAILED = -1, - ANA_PRIO_NOT_SUPPORTED = -2, - ANA_PRIO_GETANAS_FAILED = -3, - ANA_PRIO_GETANALOG_FAILED = -4, - ANA_PRIO_GETNSID_FAILED = -5, - ANA_PRIO_GETNS_FAILED = -6, - ANA_PRIO_NO_MEMORY = -7, - ANA_PRIO_NO_INFORMATION = -8, + ANA_ERR_GETCTRL_FAILED = 1, + ANA_ERR_NOT_NVME, + ANA_ERR_NOT_SUPPORTED, + ANA_ERR_GETANAS_OVERFLOW, + ANA_ERR_GETANAS_NOTFOUND, + ANA_ERR_GETANALOG_FAILED, + ANA_ERR_GETNSID_FAILED, + ANA_ERR_GETNS_FAILED, + ANA_ERR_NO_MEMORY, + ANA_ERR_NO_INFORMATION, }; -static const char * anas_string[] = { +static const char *ana_errmsg[] = { + [ANA_ERR_GETCTRL_FAILED] = "couldn't get ctrl info", + [ANA_ERR_NOT_NVME] = "not an NVMe device", + [ANA_ERR_NOT_SUPPORTED] = "ANA not supported", + [ANA_ERR_GETANAS_OVERFLOW] = "buffer overflow in ANA log", + [ANA_ERR_GETANAS_NOTFOUND] = "NSID or ANAGRPID not found", + [ANA_ERR_GETANALOG_FAILED] = "couldn't get ana log", + [ANA_ERR_GETNSID_FAILED] = "couldn't get NSID", + [ANA_ERR_GETNS_FAILED] = "couldn't get namespace info", + [ANA_ERR_NO_MEMORY] = "out of memory", + [ANA_ERR_NO_INFORMATION] = "invalid fd", +}; + +static const char *anas_string[] = { [NVME_ANA_OPTIMIZED] = "ANA Optimized State", [NVME_ANA_NONOPTIMIZED] = "ANA Non-Optimized State", [NVME_ANA_INACCESSIBLE] = "ANA Inaccessible State", [NVME_ANA_PERSISTENT_LOSS] = "ANA Persistent Loss State", [NVME_ANA_CHANGE] = "ANA Change state", - [NVME_ANA_RESERVED] = "Invalid namespace group state!", }; static const char *aas_print_string(int rc) { rc &= 0xff; - - switch(rc) { - case NVME_ANA_OPTIMIZED: - case NVME_ANA_NONOPTIMIZED: - case NVME_ANA_INACCESSIBLE: - case NVME_ANA_PERSISTENT_LOSS: - case NVME_ANA_CHANGE: + if (rc >= 0 && rc < ARRAY_SIZE(anas_string) && + anas_string[rc] != NULL) return anas_string[rc]; - default: - return anas_string[NVME_ANA_RESERVED]; - } - - return anas_string[NVME_ANA_RESERVED]; -} - -static int nvme_get_nsid(int fd, unsigned *nsid) -{ - static struct stat nvme_stat; - int err = 
fstat(fd, &nvme_stat); - if (err < 0) - return 1; - - if (!S_ISBLK(nvme_stat.st_mode)) { - condlog(0, "Error: requesting namespace-id from non-block device\n"); - return 1; - } - - *nsid = ioctl(fd, NVME_IOCTL_ID); - return 0; -} - -static int nvme_submit_admin_passthru(int fd, struct nvme_passthru_cmd *cmd) -{ - return ioctl(fd, NVME_IOCTL_ADMIN_CMD, cmd); -} - -int nvme_get_log13(int fd, __u32 nsid, __u8 log_id, __u8 lsp, __u64 lpo, - __u16 lsi, bool rae, __u32 data_len, void *data) -{ - struct nvme_admin_cmd cmd = { - .opcode = nvme_admin_get_log_page, - .nsid = nsid, - .addr = (__u64)(uintptr_t) data, - .data_len = data_len, - }; - __u32 numd = (data_len >> 2) - 1; - __u16 numdu = numd >> 16, numdl = numd & 0xffff; - - cmd.cdw10 = log_id | (numdl << 16) | (rae ? 1 << 15 : 0); - if (lsp) - cmd.cdw10 |= lsp << 8; - - cmd.cdw11 = numdu | (lsi << 16); - cmd.cdw12 = lpo; - cmd.cdw13 = (lpo >> 32); - - return nvme_submit_admin_passthru(fd, &cmd); - -} - -int nvme_identify13(int fd, __u32 nsid, __u32 cdw10, __u32 cdw11, void *data) -{ - struct nvme_admin_cmd cmd = { - .opcode = nvme_admin_identify, - .nsid = nsid, - .addr = (__u64)(uintptr_t) data, - .data_len = NVME_IDENTIFY_DATA_SIZE, - .cdw10 = cdw10, - .cdw11 = cdw11, - }; - return nvme_submit_admin_passthru(fd, &cmd); + return "invalid ANA state"; } -int nvme_identify(int fd, __u32 nsid, __u32 cdw10, void *data) +static int get_ana_state(__u32 nsid, __u32 anagrpid, void *ana_log, + size_t ana_log_len) { - return nvme_identify13(fd, nsid, cdw10, 0, data); -} - -int nvme_identify_ctrl(int fd, void *data) -{ - return nvme_identify(fd, 0, NVME_ID_CNS_CTRL, data); -} - -int nvme_identify_ns(int fd, __u32 nsid, void *data) -{ - return nvme_identify(fd, nsid, NVME_ID_CNS_NS, data); -} - -int nvme_ana_log(int fd, void *ana_log, size_t ana_log_len, int rgo) -{ - __u64 lpo = 0; - - return nvme_get_log13(fd, NVME_NSID_ALL, NVME_LOG_ANA, rgo, lpo, 0, - true, ana_log_len, ana_log); -} - -static int get_ana_state(__u32 nsid, 
__u32 anagrpid, void *ana_log) -{ - int rc = ANA_PRIO_GETANAS_FAILED; void *base = ana_log; struct nvme_ana_rsp_hdr *hdr = base; struct nvme_ana_group_desc *ana_desc; - int offset = sizeof(struct nvme_ana_rsp_hdr); + size_t offset = sizeof(struct nvme_ana_rsp_hdr); __u32 nr_nsids; size_t nsid_buf_size; int i, j; for (i = 0; i < le16_to_cpu(hdr->ngrps); i++) { ana_desc = base + offset; + + offset += sizeof(*ana_desc); + if (offset > ana_log_len) + return -ANA_ERR_GETANAS_OVERFLOW; + nr_nsids = le32_to_cpu(ana_desc->nnsids); nsid_buf_size = nr_nsids * sizeof(__le32); - offset += sizeof(*ana_desc); + offset += nsid_buf_size; + if (offset > ana_log_len) + return -ANA_ERR_GETANAS_OVERFLOW; for (j = 0; j < nr_nsids; j++) { if (nsid == le32_to_cpu(ana_desc->nsids[j])) @@ -173,12 +100,10 @@ static int get_ana_state(__u32 nsid, __u32 anagrpid, void *ana_log) } if (anagrpid != 0 && anagrpid == le32_to_cpu(ana_desc->grpid)) - rc = ana_desc->state; + return ana_desc->state; - offset += nsid_buf_size; } - - return rc; + return -ANA_ERR_GETANAS_NOTFOUND; } int get_ana_info(struct path * pp, unsigned int timeout) @@ -189,104 +114,121 @@ int get_ana_info(struct path * pp, unsigned int timeout) struct nvme_id_ns ns; void *ana_log; size_t ana_log_len; + bool is_anagrpid_const; rc = nvme_identify_ctrl(pp->fd, &ctrl); - if (rc) - return ANA_PRIO_GETCTRL_FAILED; + if (rc < 0) { + log_nvme_errcode(rc, pp->dev, "nvme_identify_ctrl"); + return -ANA_ERR_GETCTRL_FAILED; + } if(!(ctrl.cmic & (1 << 3))) - return ANA_PRIO_NOT_SUPPORTED; - - rc = nvme_get_nsid(pp->fd, &nsid); - if (rc) - return ANA_PRIO_GETNSID_FAILED; + return -ANA_ERR_NOT_SUPPORTED; - rc = nvme_identify_ns(pp->fd, nsid, &ns); - if (rc) - return ANA_PRIO_GETNS_FAILED; + nsid = nvme_get_nsid(pp->fd); + if (nsid <= 0) { + log_nvme_errcode(rc, pp->dev, "nvme_get_nsid"); + return -ANA_ERR_GETNSID_FAILED; + } + is_anagrpid_const = ctrl.anacap & (1 << 6); + /* + * Code copied from nvme-cli/nvme.c. 
We don't need to allocate an + * [nanagrpid*mnan] array of NSIDs because each NSID can occur at most + * in one ANA group. + */ ana_log_len = sizeof(struct nvme_ana_rsp_hdr) + - le32_to_cpu(ctrl.nanagrpid) * sizeof(struct nvme_ana_group_desc); - if (!(ctrl.anacap & (1 << 6))) + le32_to_cpu(ctrl.nanagrpid) + * sizeof(struct nvme_ana_group_desc); + + if (is_anagrpid_const) { + rc = nvme_identify_ns(pp->fd, nsid, 0, &ns); + if (rc) { + log_nvme_errcode(rc, pp->dev, "nvme_identify_ns"); + return -ANA_ERR_GETNS_FAILED; + } + } else ana_log_len += le32_to_cpu(ctrl.mnan) * sizeof(__le32); ana_log = malloc(ana_log_len); if (!ana_log) - return ANA_PRIO_NO_MEMORY; - + return -ANA_ERR_NO_MEMORY; + pthread_cleanup_push(free, ana_log); rc = nvme_ana_log(pp->fd, ana_log, ana_log_len, - (ctrl.anacap & (1 << 6)) ? NVME_ANA_LOG_RGO : 0); + is_anagrpid_const ? NVME_ANA_LOG_RGO : 0); if (rc) { - free(ana_log); - return ANA_PRIO_GETANALOG_FAILED; - } - - rc = get_ana_state(nsid, le32_to_cpu(ns.anagrpid), ana_log); - if (rc < 0){ - free(ana_log); - return ANA_PRIO_GETANAS_FAILED; - } - - free(ana_log); - condlog(3, "%s: ana state = %02x [%s]", pp->dev, rc, aas_print_string(rc)); - + log_nvme_errcode(rc, pp->dev, "nvme_ana_log"); + rc = -ANA_ERR_GETANALOG_FAILED; + } else + rc = get_ana_state(nsid, + is_anagrpid_const ? + le32_to_cpu(ns.anagrpid) : 0, + ana_log, ana_log_len); + pthread_cleanup_pop(1); + if (rc >= 0) + condlog(3, "%s: ana state = %02x [%s]", pp->dev, rc, + aas_print_string(rc)); return rc; } -int getprio(struct path * pp, char * args, unsigned int timeout) +/* + * Priorities modeled roughly after the ALUA model (alua.c/sysfs.c) + * Reference: ANA Base Protocol (NVMe TP 4004a, 11/13/2018). + * + * Differences: + * + * - The ANA base spec defines no implicit or explicit (STPG) state management. 
+ * If a state is encountered that doesn't allow normal I/O (all except + * OPTIMIZED and NON_OPTIMIZED), we can't do anything but either wait for an + * Access State Change Notice (can't do that in multipathd as we don't receive + * those), or retry commands in regular time intervals until ANATT is expired + * (not implemented). Mapping UNAVAILABLE state to ALUA STANDBY is the best we + * can currently do. + * + * FIXME: Waiting for ANATT could be implemented with a "delayed failback" + * mechanism. The current "failback" method can't be used, as it would + * affect failback to every state, and here only failback to UNAVAILABLE + * should be delayed. + * + * - PERSISTENT_LOSS state is even below ALUA's UNAVAILABLE state. + * FIXME: According to the ANA TP, accessing paths in PERSISTENT_LOSS state + * in any way makes no sense (e.g. §8.19.6 - paths in this state shouldn't + * even be checked under "all paths down" conditions). Device mapper can, + * and will, select a PG for IO if it has non-failed paths, even if the + * PG has priority 0. We could avoid that only with an "ANA path checker". + * + * - ALUA has no CHANGE state. The ANA TP §8.18.3 / §8.19.4 suggests + * that CHANGE state should be treated in roughly the same way as + * INACCESSIBLE. Therefore we assign the same prio to it. + * + * - ALUA's LBA-dependent state has no ANA equivalent. 
+ */ + +int getprio(struct path *pp, char *args, unsigned int timeout) { int rc; if (pp->fd < 0) - return ANA_PRIO_NO_INFORMATION; + rc = -ANA_ERR_NO_INFORMATION; + else + rc = get_ana_info(pp, timeout); - rc = get_ana_info(pp, timeout); - if (rc >= 0) { - rc &= 0x0f; - switch(rc) { - case NVME_ANA_OPTIMIZED: - rc = ANA_PRIO_OPTIMIZED; - break; - case NVME_ANA_NONOPTIMIZED: - rc = ANA_PRIO_NONOPTIMIZED; - break; - case NVME_ANA_INACCESSIBLE: - rc = ANA_PRIO_INACCESSIBLE; - break; - case NVME_ANA_PERSISTENT_LOSS: - rc = ANA_PRIO_PERSISTENT_LOSS; - break; - case NVME_ANA_CHANGE: - rc = ANA_PRIO_CHANGE; - break; - default: - rc = ANA_PRIO_RESERVED; - } - } else { - switch(rc) { - case ANA_PRIO_GETCTRL_FAILED: - condlog(0, "%s: couldn't get ctrl info", pp->dev); - break; - case ANA_PRIO_NOT_SUPPORTED: - condlog(0, "%s: ana not supported", pp->dev); - break; - case ANA_PRIO_GETANAS_FAILED: - condlog(0, "%s: couldn't get ana state", pp->dev); - break; - case ANA_PRIO_GETANALOG_FAILED: - condlog(0, "%s: couldn't get ana log", pp->dev); - break; - case ANA_PRIO_GETNS_FAILED: - condlog(0, "%s: couldn't get namespace", pp->dev); - break; - case ANA_PRIO_GETNSID_FAILED: - condlog(0, "%s: couldn't get namespace id", pp->dev); - break; - case ANA_PRIO_NO_MEMORY: - condlog(0, "%s: couldn't alloc memory", pp->dev); - break; - } + switch (rc) { + case NVME_ANA_OPTIMIZED: + return 50; + case NVME_ANA_NONOPTIMIZED: + return 10; + case NVME_ANA_INACCESSIBLE: + case NVME_ANA_CHANGE: + return 1; + case NVME_ANA_PERSISTENT_LOSS: + return 0; + default: + break; } - return rc; + if (rc < 0 && -rc < ARRAY_SIZE(ana_errmsg)) + condlog(2, "%s: ANA error: %s", pp->dev, ana_errmsg[-rc]); + else + condlog(1, "%s: invalid ANA rc code %d", pp->dev, rc); + return -1; } - diff --git a/libmultipath/prioritizers/ana.h b/libmultipath/prioritizers/ana.h deleted file mode 100644 index 92cfa9e3..00000000 --- a/libmultipath/prioritizers/ana.h +++ /dev/null @@ -1,221 +0,0 @@ -#ifndef _ANA_H -#define _ANA_H 
- -#include <linux/types.h> - -#define NVME_NSID_ALL 0xffffffff -#define NVME_IDENTIFY_DATA_SIZE 4096 - -#define NVME_LOG_ANA 0x0c - -/* Admin commands */ -enum nvme_admin_opcode { - nvme_admin_get_log_page = 0x02, - nvme_admin_identify = 0x06, -}; - -enum { - NVME_ID_CNS_NS = 0x00, - NVME_ID_CNS_CTRL = 0x01, -}; - -/* nvme ioctl start */ -struct nvme_passthru_cmd { - __u8 opcode; - __u8 flags; - __u16 rsvd1; - __u32 nsid; - __u32 cdw2; - __u32 cdw3; - __u64 metadata; - __u64 addr; - __u32 metadata_len; - __u32 data_len; - __u32 cdw10; - __u32 cdw11; - __u32 cdw12; - __u32 cdw13; - __u32 cdw14; - __u32 cdw15; - __u32 timeout_ms; - __u32 result; -}; - -#define nvme_admin_cmd nvme_passthru_cmd - -#define NVME_IOCTL_ID _IO('N', 0x40) -#define NVME_IOCTL_ADMIN_CMD _IOWR('N', 0x41, struct nvme_admin_cmd) -/* nvme ioctl end */ - -/* nvme id ctrl start */ -struct nvme_id_power_state { - __le16 max_power; /* centiwatts */ - __u8 rsvd2; - __u8 flags; - __le32 entry_lat; /* microseconds */ - __le32 exit_lat; /* microseconds */ - __u8 read_tput; - __u8 read_lat; - __u8 write_tput; - __u8 write_lat; - __le16 idle_power; - __u8 idle_scale; - __u8 rsvd19; - __le16 active_power; - __u8 active_work_scale; - __u8 rsvd23[9]; -}; - -struct nvme_id_ctrl { - __le16 vid; - __le16 ssvid; - char sn[20]; - char mn[40]; - char fr[8]; - __u8 rab; - __u8 ieee[3]; - __u8 cmic; - __u8 mdts; - __le16 cntlid; - __le32 ver; - __le32 rtd3r; - __le32 rtd3e; - __le32 oaes; - __le32 ctratt; - __u8 rsvd100[156]; - __le16 oacs; - __u8 acl; - __u8 aerl; - __u8 frmw; - __u8 lpa; - __u8 elpe; - __u8 npss; - __u8 avscc; - __u8 apsta; - __le16 wctemp; - __le16 cctemp; - __le16 mtfa; - __le32 hmpre; - __le32 hmmin; - __u8 tnvmcap[16]; - __u8 unvmcap[16]; - __le32 rpmbs; - __le16 edstt; - __u8 dsto; - __u8 fwug; - __le16 kas; - __le16 hctma; - __le16 mntmt; - __le16 mxtmt; - __le32 sanicap; - __le32 hmminds; - __le16 hmmaxd; - __u8 rsvd338[4]; - __u8 anatt; - __u8 anacap; - __le32 anagrpmax; - __le32 
nanagrpid; - __u8 rsvd352[160]; - __u8 sqes; - __u8 cqes; - __le16 maxcmd; - __le32 nn; - __le16 oncs; - __le16 fuses; - __u8 fna; - __u8 vwc; - __le16 awun; - __le16 awupf; - __u8 nvscc; - __u8 nwpc; - __le16 acwu; - __u8 rsvd534[2]; - __le32 sgls; - __le32 mnan; - __u8 rsvd544[224]; - char subnqn[256]; - __u8 rsvd1024[768]; - __le32 ioccsz; - __le32 iorcsz; - __le16 icdoff; - __u8 ctrattr; - __u8 msdbd; - __u8 rsvd1804[244]; - struct nvme_id_power_state psd[32]; - __u8 vs[1024]; -}; -/* nvme id ctrl end */ - -/* nvme id ns start */ -struct nvme_lbaf { - __le16 ms; - __u8 ds; - __u8 rp; -}; - -struct nvme_id_ns { - __le64 nsze; - __le64 ncap; - __le64 nuse; - __u8 nsfeat; - __u8 nlbaf; - __u8 flbas; - __u8 mc; - __u8 dpc; - __u8 dps; - __u8 nmic; - __u8 rescap; - __u8 fpi; - __u8 rsvd33; - __le16 nawun; - __le16 nawupf; - __le16 nacwu; - __le16 nabsn; - __le16 nabo; - __le16 nabspf; - __le16 noiob; - __u8 nvmcap[16]; - __u8 rsvd64[28]; - __le32 anagrpid; - __u8 rsvd96[3]; - __u8 nsattr; - __u8 rsvd100[4]; - __u8 nguid[16]; - __u8 eui64[8]; - struct nvme_lbaf lbaf[16]; - __u8 rsvd192[192]; - __u8 vs[3712]; -}; -/* nvme id ns end */ - -/* nvme ana start */ -enum nvme_ana_state { - NVME_ANA_OPTIMIZED = 0x01, - NVME_ANA_NONOPTIMIZED = 0x02, - NVME_ANA_INACCESSIBLE = 0x03, - NVME_ANA_PERSISTENT_LOSS = 0x04, - NVME_ANA_CHANGE = 0x0f, - NVME_ANA_RESERVED = 0x05, -}; - -struct nvme_ana_rsp_hdr { - __le64 chgcnt; - __le16 ngrps; - __le16 rsvd10[3]; -}; - -struct nvme_ana_group_desc { - __le32 grpid; - __le32 nnsids; - __le64 chgcnt; - __u8 state; - __u8 rsvd17[15]; - __le32 nsids[]; -}; - -/* flag for the log specific field of the ANA log */ -#define NVME_ANA_LOG_RGO (1 << 0) - -/* nvme ana end */ - -#endif -- 2.19.2 -- dm-devel mailing list dm-devel@xxxxxxxxxx https://www.redhat.com/mailman/listinfo/dm-devel