On 2/21/2019 9:33 AM, Gal Pressman wrote: > Add admin commands submissions/completions implementation. > > Signed-off-by: Gal Pressman <galpress@xxxxxxxxxx> > --- > drivers/infiniband/hw/efa/efa_com.c | 1184 +++++++++++++++++++++++++++++++++++ > 1 file changed, 1184 insertions(+) > create mode 100644 drivers/infiniband/hw/efa/efa_com.c > > diff --git a/drivers/infiniband/hw/efa/efa_com.c b/drivers/infiniband/hw/efa/efa_com.c > new file mode 100644 > index 000000000000..e61f635be942 > --- /dev/null > +++ b/drivers/infiniband/hw/efa/efa_com.c > @@ -0,0 +1,1184 @@ > +// SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause > +/* > + * Copyright 2018-2019 Amazon.com, Inc. or its affiliates. All rights reserved. > + */ > + > +#include "efa.h" > +#include "efa_com.h" > +#include "efa_regs_defs.h" > + > +#define ADMIN_CMD_TIMEOUT_US 30000000 /* usecs */ > + > +#define EFA_REG_READ_TIMEOUT_US 50000 /* usecs */ > +#define EFA_MMIO_READ_INVALID 0xffffffff > + > +#define EFA_POLL_INTERVAL_MS 100 /* msecs */ > + > +#define EFA_ASYNC_QUEUE_DEPTH 16 > +#define EFA_ADMIN_QUEUE_DEPTH 32 > + > +#define MIN_EFA_VER\ > + ((EFA_ADMIN_API_VERSION_MAJOR << EFA_REGS_VERSION_MAJOR_VERSION_SHIFT) | \ > + (EFA_ADMIN_API_VERSION_MINOR & EFA_REGS_VERSION_MINOR_VERSION_MASK)) > + > +#define EFA_CTRL_MAJOR 0 > +#define EFA_CTRL_MINOR 0 > +#define EFA_CTRL_SUB_MINOR 1 > + > +#define MIN_EFA_CTRL_VER \ > + (((EFA_CTRL_MAJOR) << \ > + (EFA_REGS_CONTROLLER_VERSION_MAJOR_VERSION_SHIFT)) | \ > + ((EFA_CTRL_MINOR) << \ > + (EFA_REGS_CONTROLLER_VERSION_MINOR_VERSION_SHIFT)) | \ > + (EFA_CTRL_SUB_MINOR)) > + > +#define EFA_DMA_ADDR_TO_UINT32_LOW(x) ((u32)((u64)(x))) > +#define EFA_DMA_ADDR_TO_UINT32_HIGH(x) ((u32)(((u64)(x)) >> 32)) > + > +#define EFA_REGS_ADMIN_INTR_MASK 1 > + > +#define efa_admin_stat_add(aq, stat, val) \ > + do { \ > + typeof(aq) _aq = aq; \ > + unsigned long flags; \ > + \ > + spin_lock_irqsave(&_aq->stats_lock, flags); \ > + (stat) += val; \ > + spin_unlock_irqrestore(&_aq->stats_lock, flags); \ > + } while (0) > + > +#define efa_admin_stat_inc(aq, stat) efa_admin_stat_add(aq, stat, 1) > + > +enum efa_cmd_status { > + EFA_CMD_SUBMITTED, > + EFA_CMD_COMPLETED, > + /* Abort - canceled by the driver */ > + EFA_CMD_ABORTED, > +}; > + > +struct efa_comp_ctx { > + struct completion wait_event; > + struct efa_admin_acq_entry *user_cqe; > + u32 comp_size; > + enum efa_cmd_status status; > + /* status from the device */ > + u8 comp_status; > + u8 cmd_opcode; > + u8 occupied; > +}; > + > +static const char *efa_com_cmd_str(u8 cmd) > +{ > +#define EFA_CMD_STR_CASE(_cmd) case EFA_ADMIN_##_cmd: return #_cmd > + > + switch (cmd) { > + EFA_CMD_STR_CASE(CREATE_QP); > + EFA_CMD_STR_CASE(MODIFY_QP); > + EFA_CMD_STR_CASE(QUERY_QP); > + EFA_CMD_STR_CASE(DESTROY_QP); > + EFA_CMD_STR_CASE(CREATE_AH); > + EFA_CMD_STR_CASE(DESTROY_AH); > + EFA_CMD_STR_CASE(REG_MR); > + EFA_CMD_STR_CASE(DEREG_MR); > + EFA_CMD_STR_CASE(CREATE_CQ); > + EFA_CMD_STR_CASE(DESTROY_CQ); > + EFA_CMD_STR_CASE(GET_FEATURE); > + EFA_CMD_STR_CASE(SET_FEATURE); > + EFA_CMD_STR_CASE(GET_STATS); > + EFA_CMD_STR_CASE(ALLOC_PD); > + EFA_CMD_STR_CASE(DEALLOC_PD); > + EFA_CMD_STR_CASE(ALLOC_UAR); > + EFA_CMD_STR_CASE(DEALLOC_UAR); > + default: return "unknown command opcode"; > + } Should you #undef EFA_CMD_STR_CASE here? 
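To keep the helper macro from leaking past this function, something like this at the end would do (illustrative only):

	switch (cmd) {
	EFA_CMD_STR_CASE(CREATE_QP);
	/* ... remaining cases ... */
	default: return "unknown command opcode";
	}
#undef EFA_CMD_STR_CASE
}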
> +} > + > +static u32 efa_com_reg_read32(struct efa_com_dev *edev, u16 offset) > +{ > + struct efa_com_mmio_read *mmio_read = &edev->mmio_read; > + struct efa_admin_mmio_req_read_less_resp *read_resp; > + unsigned long exp_time; > + u32 mmio_read_reg; > + u32 err; > + > + read_resp = mmio_read->read_resp; > + > + spin_lock(&mmio_read->lock); > + mmio_read->seq_num++; > + > + /* trash DMA req_id to identify when hardware is done */ > + read_resp->req_id = mmio_read->seq_num + 0x9aL; > + mmio_read_reg = (offset << EFA_REGS_MMIO_REG_READ_REG_OFF_SHIFT) & > + EFA_REGS_MMIO_REG_READ_REG_OFF_MASK; > + mmio_read_reg |= mmio_read->seq_num & > + EFA_REGS_MMIO_REG_READ_REQ_ID_MASK; > + > + writel(mmio_read_reg, edev->reg_bar + EFA_REGS_MMIO_REG_READ_OFF); > + > + exp_time = jiffies + usecs_to_jiffies(mmio_read->mmio_read_timeout); > + do { > + if (READ_ONCE(read_resp->req_id) == mmio_read->seq_num) > + break; > + udelay(1); > + } while (time_is_after_jiffies(exp_time)); > + > + if (unlikely(read_resp->req_id != mmio_read->seq_num)) { > + efa_err(edev->dmadev, > + "Reading register timed out. expected: req id[%u] offset[%#x] actual: req id[%u] offset[%#x]\n", > + mmio_read->seq_num, > + offset, > + read_resp->req_id, > + read_resp->reg_off); > + err = EFA_MMIO_READ_INVALID; > + goto out; > + } > + > + if (read_resp->reg_off != offset) { > + efa_err(edev->dmadev, > + "Reading register failed: wrong offset provided\n"); > + err = EFA_MMIO_READ_INVALID; > + goto out; > + } > + > + err = read_resp->reg_val; > +out: > + spin_unlock(&mmio_read->lock); > + return err; > +} > + > +static int efa_com_admin_init_sq(struct efa_com_dev *edev) > +{ > + struct efa_com_admin_queue *aq = &edev->aq; > + struct efa_com_admin_sq *sq = &aq->sq; > + u16 size = ADMIN_SQ_SIZE(aq->depth); > + u32 addr_high; > + u32 addr_low; > + u32 aq_caps; > + > + sq->entries = > + dma_alloc_coherent(aq->dmadev, size, &sq->dma_addr, GFP_KERNEL); > + if (!sq->entries) > + return -ENOMEM; > + > + spin_lock_init(&sq->lock); > + > + sq->cc = 0; > + sq->pc = 0; > + sq->phase = 1; > + > + sq->db_addr = (u32 __iomem *)(edev->reg_bar + EFA_REGS_AQ_PROD_DB_OFF); > + > + addr_high = EFA_DMA_ADDR_TO_UINT32_HIGH(sq->dma_addr); > + addr_low = EFA_DMA_ADDR_TO_UINT32_LOW(sq->dma_addr); > + > + writel(addr_low, edev->reg_bar + EFA_REGS_AQ_BASE_LO_OFF); > + writel(addr_high, edev->reg_bar + EFA_REGS_AQ_BASE_HI_OFF); > + > + aq_caps = 0; > + aq_caps |= aq->depth & EFA_REGS_AQ_CAPS_AQ_DEPTH_MASK; > + aq_caps |= (sizeof(struct efa_admin_aq_entry) << > + EFA_REGS_AQ_CAPS_AQ_ENTRY_SIZE_SHIFT) & > + EFA_REGS_AQ_CAPS_AQ_ENTRY_SIZE_MASK; > + > + writel(aq_caps, edev->reg_bar + EFA_REGS_AQ_CAPS_OFF); > + > + return 0; > +} > + > +static int efa_com_admin_init_cq(struct efa_com_dev *edev) > +{ > + struct efa_com_admin_queue *aq = &edev->aq; > + struct efa_com_admin_cq *cq = &aq->cq; > + u16 size = ADMIN_CQ_SIZE(aq->depth); > + u32 addr_high; > + u32 addr_low; > + u32 acq_caps; > + > + cq->entries = > + dma_alloc_coherent(aq->dmadev, size, &cq->dma_addr, GFP_KERNEL); > + if (!cq->entries) > + return -ENOMEM; > + > + spin_lock_init(&cq->lock); > + > + cq->cc = 0; > + cq->phase = 1; > + > + addr_high = EFA_DMA_ADDR_TO_UINT32_HIGH(cq->dma_addr); > + addr_low = EFA_DMA_ADDR_TO_UINT32_LOW(cq->dma_addr); > + > + writel(addr_low, edev->reg_bar + EFA_REGS_ACQ_BASE_LO_OFF); > + writel(addr_high, edev->reg_bar + EFA_REGS_ACQ_BASE_HI_OFF); > + > + acq_caps = 0; > + acq_caps |= aq->depth & EFA_REGS_ACQ_CAPS_ACQ_DEPTH_MASK; > + acq_caps |= (sizeof(struct efa_admin_acq_entry) << 
> + EFA_REGS_ACQ_CAPS_ACQ_ENTRY_SIZE_SHIFT) & > + EFA_REGS_ACQ_CAPS_ACQ_ENTRY_SIZE_MASK; > + acq_caps |= (aq->msix_vector_idx << > + EFA_REGS_ACQ_CAPS_ACQ_MSIX_VECTOR_SHIFT) & > + EFA_REGS_ACQ_CAPS_ACQ_MSIX_VECTOR_MASK; > + > + writel(acq_caps, edev->reg_bar + EFA_REGS_ACQ_CAPS_OFF); > + > + return 0; > +} > + > +static int efa_com_admin_init_aenq(struct efa_com_dev *edev, > + struct efa_aenq_handlers *aenq_handlers) > +{ > + struct efa_com_aenq *aenq = &edev->aenq; > + u32 addr_low, addr_high, aenq_caps; > + u16 size; > + > + if (unlikely(!aenq_handlers)) { > + efa_err(edev->dmadev, "aenq handlers pointer is NULL\n"); > + return -EINVAL; > + } > + > + size = ADMIN_AENQ_SIZE(EFA_ASYNC_QUEUE_DEPTH); > + aenq->entries = dma_alloc_coherent(edev->dmadev, size, &aenq->dma_addr, > + GFP_KERNEL); > + if (!aenq->entries) > + return -ENOMEM; > + > + aenq->aenq_handlers = aenq_handlers; > + aenq->depth = EFA_ASYNC_QUEUE_DEPTH; > + aenq->cc = 0; > + aenq->phase = 1; > + > + addr_low = EFA_DMA_ADDR_TO_UINT32_LOW(aenq->dma_addr); > + addr_high = EFA_DMA_ADDR_TO_UINT32_HIGH(aenq->dma_addr); > + > + writel(addr_low, edev->reg_bar + EFA_REGS_AENQ_BASE_LO_OFF); > + writel(addr_high, edev->reg_bar + EFA_REGS_AENQ_BASE_HI_OFF); > + > + aenq_caps = 0; > + aenq_caps |= aenq->depth & EFA_REGS_AENQ_CAPS_AENQ_DEPTH_MASK; > + aenq_caps |= (sizeof(struct efa_admin_aenq_entry) << > + EFA_REGS_AENQ_CAPS_AENQ_ENTRY_SIZE_SHIFT) & > + EFA_REGS_AENQ_CAPS_AENQ_ENTRY_SIZE_MASK; > + aenq_caps |= (aenq->msix_vector_idx > + << EFA_REGS_AENQ_CAPS_AENQ_MSIX_VECTOR_SHIFT) & > + EFA_REGS_AENQ_CAPS_AENQ_MSIX_VECTOR_MASK; > + writel(aenq_caps, edev->reg_bar + EFA_REGS_AENQ_CAPS_OFF); > + > + /* > + * Init cons_db to mark that all entries in the queue > + * are initially available > + */ > + writel(edev->aenq.cc, edev->reg_bar + EFA_REGS_AENQ_CONS_DB_OFF); > + Do you need any barrier type operation here to ensure these values have been read by HW? > + return 0; > +} > + > +/* ID to be used with efa_com_get_comp_ctx */ > +static u16 efa_com_alloc_ctx_id(struct efa_com_admin_queue *aq) > +{ > + u16 ctx_id; > + > + spin_lock(&aq->comp_ctx_lock); > + ctx_id = aq->comp_ctx_pool[aq->comp_ctx_pool_next]; > + aq->comp_ctx_pool_next++; > + spin_unlock(&aq->comp_ctx_lock); > + > + return ctx_id; > +} > + > +static void efa_com_dealloc_ctx_id(struct efa_com_admin_queue *aq, > + u16 ctx_id) > +{ > + spin_lock(&aq->comp_ctx_lock); > + aq->comp_ctx_pool_next--; > + aq->comp_ctx_pool[aq->comp_ctx_pool_next] = ctx_id; > + spin_unlock(&aq->comp_ctx_lock); > +} > + > +static inline void efa_com_put_comp_ctx(struct efa_com_admin_queue *aq, > + struct efa_comp_ctx *comp_ctx) > +{ > + u16 comp_id = comp_ctx->user_cqe->acq_common_descriptor.command & > + EFA_ADMIN_ACQ_COMMON_DESC_COMMAND_ID_MASK; > + > + efa_dbg(aq->dmadev, "Putting completion command_id %d\n", comp_id); > + comp_ctx->occupied = 0; > + efa_com_dealloc_ctx_id(aq, comp_id); > +} > + > +static struct efa_comp_ctx *efa_com_get_comp_ctx(struct efa_com_admin_queue *aq, > + u16 command_id, bool capture) > +{ > + if (unlikely(command_id >= aq->depth)) { > + efa_err(aq->dmadev, > + "command id is larger than the queue size. 
cmd_id: %u queue size %d\n", > + command_id, aq->depth); > + return NULL; > + } > + > + if (unlikely(aq->comp_ctx[command_id].occupied && capture)) { > + efa_err(aq->dmadev, "Completion context is occupied\n"); > + return NULL; > + } > + > + if (capture) { > + aq->comp_ctx[command_id].occupied = 1; > + efa_dbg(aq->dmadev, > + "Taking completion ctxt command_id %d\n", > + command_id); > + } > + > + return &aq->comp_ctx[command_id]; > +} > + > +static struct efa_comp_ctx *__efa_com_submit_admin_cmd(struct efa_com_admin_queue *aq, > + struct efa_admin_aq_entry *cmd, > + size_t cmd_size_in_bytes, > + struct efa_admin_acq_entry *comp, > + size_t comp_size_in_bytes) > +{ > + struct efa_comp_ctx *comp_ctx; > + u16 queue_size_mask; > + u16 ctx_id; > + u16 pi; > + > + queue_size_mask = aq->depth - 1; > + pi = aq->sq.pc & queue_size_mask; > + > + ctx_id = efa_com_alloc_ctx_id(aq); > + > + cmd->aq_common_descriptor.flags |= aq->sq.phase & > + EFA_ADMIN_AQ_COMMON_DESC_PHASE_MASK; > + > + cmd->aq_common_descriptor.command_id |= ctx_id & > + EFA_ADMIN_AQ_COMMON_DESC_COMMAND_ID_MASK; > + > + comp_ctx = efa_com_get_comp_ctx(aq, ctx_id, true); > + if (unlikely(!comp_ctx)) { > + efa_com_dealloc_ctx_id(aq, ctx_id); > + return ERR_PTR(-EINVAL); > + } > + > + comp_ctx->status = EFA_CMD_SUBMITTED; > + comp_ctx->comp_size = comp_size_in_bytes; > + comp_ctx->user_cqe = comp; > + comp_ctx->cmd_opcode = cmd->aq_common_descriptor.opcode; > + > + reinit_completion(&comp_ctx->wait_event); > + > + memcpy(&aq->sq.entries[pi], cmd, cmd_size_in_bytes); > + > + aq->sq.pc++; > + efa_admin_stat_inc(aq, aq->stats.submitted_cmd); > + > + if (unlikely((aq->sq.pc & queue_size_mask) == 0)) > + aq->sq.phase = !aq->sq.phase; > + > + /* barrier not needed in case of writel */ > + writel(aq->sq.pc, aq->sq.db_addr); > + > + return comp_ctx; > +} > + > +static inline int efa_com_init_comp_ctxt(struct efa_com_admin_queue *aq) > +{ > + size_t pool_size = aq->depth * sizeof(*aq->comp_ctx_pool); > + size_t size = aq->depth * sizeof(struct efa_comp_ctx); > + struct efa_comp_ctx *comp_ctx; > + u16 i; > + > + aq->comp_ctx = devm_kzalloc(aq->dmadev, size, GFP_KERNEL); > + aq->comp_ctx_pool = devm_kzalloc(aq->dmadev, pool_size, GFP_KERNEL); > + if (unlikely(!aq->comp_ctx || !aq->comp_ctx_pool)) { > + devm_kfree(aq->dmadev, aq->comp_ctx_pool); > + devm_kfree(aq->dmadev, aq->comp_ctx); > + return -ENOMEM; > + } > + > + for (i = 0; i < aq->depth; i++) { > + comp_ctx = efa_com_get_comp_ctx(aq, i, false); > + if (comp_ctx) > + init_completion(&comp_ctx->wait_event); > + > + aq->comp_ctx_pool[i] = i; > + } > + > + spin_lock_init(&aq->comp_ctx_lock); > + > + aq->comp_ctx_pool_next = 0; > + > + return 0; > +} > + > +static struct efa_comp_ctx *efa_com_submit_admin_cmd(struct efa_com_admin_queue *aq, > + struct efa_admin_aq_entry *cmd, > + size_t cmd_size_in_bytes, > + struct efa_admin_acq_entry *comp, > + size_t comp_size_in_bytes) > +{ > + struct efa_comp_ctx *comp_ctx; > + > + spin_lock(&aq->sq.lock); > + if (unlikely(!test_bit(EFA_AQ_STATE_RUNNING_BIT, &aq->state))) { > + efa_err(aq->dmadev, "Admin queue is closed\n"); > + spin_unlock(&aq->sq.lock); > + return ERR_PTR(-ENODEV); > + } > + > + comp_ctx = __efa_com_submit_admin_cmd(aq, cmd, cmd_size_in_bytes, comp, > + comp_size_in_bytes); > + spin_unlock(&aq->sq.lock); > + if (unlikely(IS_ERR(comp_ctx))) > + clear_bit(EFA_AQ_STATE_RUNNING_BIT, &aq->state); > + > + return comp_ctx; > +} > + > +static void efa_com_handle_single_admin_completion(struct efa_com_admin_queue *aq, > + struct 
efa_admin_acq_entry *cqe) > +{ > + struct efa_comp_ctx *comp_ctx; > + u16 cmd_id; > + > + cmd_id = cqe->acq_common_descriptor.command & > + EFA_ADMIN_ACQ_COMMON_DESC_COMMAND_ID_MASK; > + > + comp_ctx = efa_com_get_comp_ctx(aq, cmd_id, false); > + if (unlikely(!comp_ctx)) { > + efa_err(aq->dmadev, > + "comp_ctx is NULL. Changing the admin queue running state\n"); > + clear_bit(EFA_AQ_STATE_RUNNING_BIT, &aq->state); > + return; > + } > + > + comp_ctx->status = EFA_CMD_COMPLETED; > + comp_ctx->comp_status = cqe->acq_common_descriptor.status; > + if (comp_ctx->user_cqe) > + memcpy(comp_ctx->user_cqe, cqe, comp_ctx->comp_size); > + > + if (!test_bit(EFA_AQ_STATE_POLLING_BIT, &aq->state)) > + complete(&comp_ctx->wait_event); > +} > + > +static void efa_com_handle_admin_completion(struct efa_com_admin_queue *aq) > +{ > + struct efa_admin_acq_entry *cqe; > + u16 queue_size_mask; > + u16 comp_num = 0; > + u8 phase; > + u16 ci; > + > + queue_size_mask = aq->depth - 1; > + > + ci = aq->cq.cc & queue_size_mask; > + phase = aq->cq.phase; > + > + cqe = &aq->cq.entries[ci]; > + > + /* Go over all the completions */ > + while ((READ_ONCE(cqe->acq_common_descriptor.flags) & > + EFA_ADMIN_ACQ_COMMON_DESC_PHASE_MASK) == phase) { > + /* > + * Do not read the rest of the completion entry before the > + * phase bit was validated > + */ > + dma_rmb(); > + efa_com_handle_single_admin_completion(aq, cqe); > + > + ci++; > + comp_num++; > + if (unlikely(ci == aq->depth)) { > + ci = 0; > + phase = !phase; > + } > + > + cqe = &aq->cq.entries[ci]; > + } > + > + aq->cq.cc += comp_num; > + aq->cq.phase = phase; > + aq->sq.cc += comp_num; > + efa_admin_stat_add(aq, aq->stats.completed_cmd, comp_num); > +} > + > +static int efa_com_comp_status_to_errno(struct efa_com_admin_queue *aq, > + u8 comp_status) > +{ > + switch (comp_status) { > + case EFA_ADMIN_SUCCESS: > + return 0; > + case EFA_ADMIN_RESOURCE_ALLOCATION_FAILURE: > + return -ENOMEM; > + case EFA_ADMIN_UNSUPPORTED_OPCODE: > + return -EOPNOTSUPP; > + case EFA_ADMIN_BAD_OPCODE: > + case EFA_ADMIN_MALFORMED_REQUEST: > + case EFA_ADMIN_ILLEGAL_PARAMETER: > + case EFA_ADMIN_UNKNOWN_ERROR: > + return -EINVAL; > + default: > + return -EINVAL; > + } > +} > + > +static int efa_com_wait_and_process_admin_cq_polling(struct efa_comp_ctx *comp_ctx, > + struct efa_com_admin_queue *aq) > +{ > + unsigned long timeout; > + unsigned long flags; > + int err; > + > + timeout = jiffies + usecs_to_jiffies(aq->completion_timeout); > + > + while (1) { > + spin_lock_irqsave(&aq->cq.lock, flags); > + efa_com_handle_admin_completion(aq); > + spin_unlock_irqrestore(&aq->cq.lock, flags); > + > + if (comp_ctx->status != EFA_CMD_SUBMITTED) > + break; > + > + if (time_is_before_jiffies(timeout)) { > + efa_err(aq->dmadev, > + "Wait for completion (polling) timeout\n"); > + /* EFA didn't have any completion */ > + efa_admin_stat_inc(aq, aq->stats.no_completion); > + > + clear_bit(EFA_AQ_STATE_RUNNING_BIT, &aq->state); > + err = -ETIME; > + goto out; > + } > + > + msleep(aq->poll_interval); > + } > + > + if (unlikely(comp_ctx->status == EFA_CMD_ABORTED)) { > + efa_err(aq->dmadev, "Command was aborted\n"); > + efa_admin_stat_inc(aq, aq->stats.aborted_cmd); > + err = -ENODEV; > + goto out; > + } > + > + WARN(comp_ctx->status != EFA_CMD_COMPLETED, "Invalid comp status %d\n", > + comp_ctx->status); > + Should the WARN() be rate limited or maybe WARN_ONCE()? Or perhaps just a pr_warn(). 
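If you want it to fire at most once, WARN_ONCE() is a drop-in replacement:

	WARN_ONCE(comp_ctx->status != EFA_CMD_COMPLETED,
		  "Invalid comp status %d\n", comp_ctx->status);

A condition that hardware misbehavior can trigger repeatedly probably shouldn't be able to spam a full backtrace on every command.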
> + err = efa_com_comp_status_to_errno(aq, comp_ctx->comp_status); > +out: > + efa_com_put_comp_ctx(aq, comp_ctx); > + return err; > +} > + > +static int efa_com_wait_and_process_admin_cq_interrupts(struct efa_comp_ctx *comp_ctx, > + struct efa_com_admin_queue *aq) > +{ > + unsigned long flags; > + int err; > + > + wait_for_completion_timeout(&comp_ctx->wait_event, > + usecs_to_jiffies(aq->completion_timeout)); > + > + /* > + * In case the command wasn't completed find out the root cause. > + * There might be 2 kinds of errors > + * 1) No completion (timeout reached) > + * 2) There is completion but the device didn't get any msi-x interrupt. > + */ > + if (unlikely(comp_ctx->status == EFA_CMD_SUBMITTED)) { > + spin_lock_irqsave(&aq->cq.lock, flags); > + efa_com_handle_admin_completion(aq); > + spin_unlock_irqrestore(&aq->cq.lock, flags); > + > + efa_admin_stat_inc(aq, aq->stats.no_completion); > + > + if (comp_ctx->status == EFA_CMD_COMPLETED) > + efa_err(aq->dmadev, > + "The device sent a completion but the driver didn't receive any MSI-X interrupt for admin cmd %s(%d) status %d (ctx: 0x%p, sq producer: %d, sq consumer: %d, cq consumer: %d)\n", > + efa_com_cmd_str(comp_ctx->cmd_opcode), > + comp_ctx->cmd_opcode, comp_ctx->status, > + comp_ctx, aq->sq.pc, > + aq->sq.cc, aq->cq.cc); > + else > + efa_err(aq->dmadev, > + "The device didn't send any completion for admin cmd %s(%d) status %d (ctx 0x%p, sq producer: %d, sq consumer: %d, cq consumer: %d)\n", > + efa_com_cmd_str(comp_ctx->cmd_opcode), > + comp_ctx->cmd_opcode, comp_ctx->status, > + comp_ctx, aq->sq.pc, > + aq->sq.cc, aq->cq.cc); > + > + clear_bit(EFA_AQ_STATE_RUNNING_BIT, &aq->state); > + err = -ETIME; > + goto out; > + } > + > + err = efa_com_comp_status_to_errno(aq, comp_ctx->comp_status); > +out: > + efa_com_put_comp_ctx(aq, comp_ctx); > + return err; > +} > + > +/* > + * There are two types to wait for completion. > + * Polling mode - wait until the completion is available. > + * Async mode - wait on wait queue until the completion is ready > + * (or the timeout expired). > + * It is expected that the IRQ called efa_com_handle_admin_completion > + * to mark the completions. > + */ > +static int efa_com_wait_and_process_admin_cq(struct efa_comp_ctx *comp_ctx, > + struct efa_com_admin_queue *aq) > +{ > + if (test_bit(EFA_AQ_STATE_POLLING_BIT, &aq->state)) > + return efa_com_wait_and_process_admin_cq_polling(comp_ctx, aq); > + > + return efa_com_wait_and_process_admin_cq_interrupts(comp_ctx, aq); > +} > + > +/* > + * efa_com_cmd_exec - Execute admin command > + * @aq: admin queue. > + * @cmd: the admin command to execute. > + * @cmd_size: the command size. > + * @comp: command completion return entry. > + * @comp_size: command completion size. > + * Submit an admin command and then wait until the device will return a > + * completion. > + * The completion will be copied into comp. > + * > + * @return - 0 on success, negative value on failure. > + */ Are all these function comments proper kdoc or whatever format? I thought it should be /** to start? Or maybe it doesn't matter? 
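As far as I know, scripts/kernel-doc only picks up comments that open with /**, so as written these are treated as plain comments and won't be extracted (which may be fine if that's the intent). If you do want proper kernel-doc, it would look roughly like:

	/**
	 * efa_com_cmd_exec() - Execute admin command
	 * @aq: admin queue.
	 * @cmd: the admin command to execute.
	 * @cmd_size: the command size.
	 * @comp: command completion return entry.
	 * @comp_size: command completion size.
	 *
	 * Submit an admin command and then wait until the device returns a
	 * completion. The completion is copied into @comp.
	 *
	 * Return: 0 on success, negative value on failure.
	 */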
> +int efa_com_cmd_exec(struct efa_com_admin_queue *aq, > + struct efa_admin_aq_entry *cmd, > + size_t cmd_size, > + struct efa_admin_acq_entry *comp, > + size_t comp_size) > +{ > + struct efa_comp_ctx *comp_ctx; > + int err; > + > + might_sleep(); > + > + /* In case of queue FULL */ > + down(&aq->avail_cmds); > + > + efa_dbg(aq->dmadev, "%s (opcode %d)\n", > + efa_com_cmd_str(cmd->aq_common_descriptor.opcode), > + cmd->aq_common_descriptor.opcode); > + comp_ctx = efa_com_submit_admin_cmd(aq, cmd, cmd_size, comp, comp_size); > + if (unlikely(IS_ERR(comp_ctx))) { > + efa_err(aq->dmadev, > + "Failed to submit command %s (opcode %u) err %ld\n", > + efa_com_cmd_str(cmd->aq_common_descriptor.opcode), > + cmd->aq_common_descriptor.opcode, PTR_ERR(comp_ctx)); > + > + up(&aq->avail_cmds); > + return PTR_ERR(comp_ctx); > + } > + > + err = efa_com_wait_and_process_admin_cq(comp_ctx, aq); > + if (unlikely(err)) > + efa_err(aq->dmadev, > + "Failed to process command %s (opcode %u) comp_status %d err %d\n", > + efa_com_cmd_str(cmd->aq_common_descriptor.opcode), > + cmd->aq_common_descriptor.opcode, comp_ctx->comp_status, > + err); > + > + up(&aq->avail_cmds); > + > + return err; > +} > + > +/* > + * efa_com_abort_admin_commands - Abort all the outstanding admin commands. > + * @edev: EFA communication layer struct > + * > + * This method aborts all the outstanding admin commands. > + * The caller should then call efa_com_wait_for_abort_completion to make sure > + * all the commands were completed. > + */ > +static void efa_com_abort_admin_commands(struct efa_com_dev *edev) > +{ > + struct efa_com_admin_queue *aq = &edev->aq; > + struct efa_comp_ctx *comp_ctx; > + unsigned long flags; > + u16 i; > + > + spin_lock(&aq->sq.lock); > + spin_lock_irqsave(&aq->cq.lock, flags); > + for (i = 0; i < aq->depth; i++) { > + comp_ctx = efa_com_get_comp_ctx(aq, i, false); > + if (unlikely(!comp_ctx)) > + break; > + > + comp_ctx->status = EFA_CMD_ABORTED; > + > + complete(&comp_ctx->wait_event); > + } > + spin_unlock_irqrestore(&aq->cq.lock, flags); > + spin_unlock(&aq->sq.lock); > +} > + > +/* > + * efa_com_wait_for_abort_completion - Wait for admin commands abort. > + * @edev: EFA communication layer struct > + * > + * This method wait until all the outstanding admin commands will be completed. > + */ > +static void efa_com_wait_for_abort_completion(struct efa_com_dev *edev) > +{ > + struct efa_com_admin_queue *aq = &edev->aq; > + int i; > + > + /* all mine */ > + for (i = 0; i < aq->depth; i++) > + down(&aq->avail_cmds); > + > + /* let it go */ > + for (i = 0; i < aq->depth; i++) > + up(&aq->avail_cmds); > +} > + > +static void efa_com_admin_flush(struct efa_com_dev *edev) > +{ > + struct efa_com_admin_queue *aq = &edev->aq; > + > + clear_bit(EFA_AQ_STATE_RUNNING_BIT, &aq->state); > + > + efa_com_abort_admin_commands(edev); > + efa_com_wait_for_abort_completion(edev); > +} > + > +/* > + * efa_com_admin_destroy - Destroy the admin and the async events queues. 
> + * @edev: EFA communication layer struct > + */ > +void efa_com_admin_destroy(struct efa_com_dev *edev) > +{ > + struct efa_com_admin_queue *aq = &edev->aq; > + struct efa_com_aenq *aenq = &edev->aenq; > + struct efa_com_admin_cq *cq = &aq->cq; > + struct efa_com_admin_sq *sq = &aq->sq; > + u16 size; > + > + efa_com_admin_flush(edev); > + > + devm_kfree(edev->dmadev, aq->comp_ctx_pool); > + devm_kfree(edev->dmadev, aq->comp_ctx); > + > + size = ADMIN_SQ_SIZE(aq->depth); > + dma_free_coherent(edev->dmadev, size, sq->entries, sq->dma_addr); > + > + size = ADMIN_CQ_SIZE(aq->depth); > + dma_free_coherent(edev->dmadev, size, cq->entries, cq->dma_addr); > + > + size = ADMIN_AENQ_SIZE(aenq->depth); > + dma_free_coherent(edev->dmadev, size, aenq->entries, aenq->dma_addr); > +} > + > +/* > + * efa_com_set_admin_polling_mode - Set the admin completion queue polling mode > + * @edev: EFA communication layer struct > + * @polling: Enable/Disable polling mode > + * > + * Set the admin completion mode. > + */ > +void efa_com_set_admin_polling_mode(struct efa_com_dev *edev, bool polling) > +{ > + u32 mask_value = 0; > + > + if (polling) > + mask_value = EFA_REGS_ADMIN_INTR_MASK; > + > + writel(mask_value, edev->reg_bar + EFA_REGS_INTR_MASK_OFF); > + if (polling) > + set_bit(EFA_AQ_STATE_POLLING_BIT, &edev->aq.state); > + else > + clear_bit(EFA_AQ_STATE_POLLING_BIT, &edev->aq.state); > +} > + > +/* > + * efa_com_admin_init - Init the admin and the async queues > + * @edev: EFA communication layer struct > + * @aenq_handlers: Those handlers to be called upon event. > + * > + * Initialize the admin submission and completion queues. > + * Initialize the asynchronous events notification queues. > + * > + * @return - 0 on success, negative value on failure. > + */ > +int efa_com_admin_init(struct efa_com_dev *edev, > + struct efa_aenq_handlers *aenq_handlers) > +{ > + struct efa_com_admin_queue *aq = &edev->aq; > + u32 timeout; > + u32 dev_sts; > + u32 cap; > + int err; > + > + dev_sts = efa_com_reg_read32(edev, EFA_REGS_DEV_STS_OFF); > + if (!(dev_sts & EFA_REGS_DEV_STS_READY_MASK)) { > + efa_err(edev->dmadev, > + "Device isn't ready, abort com init 0x%08x\n", > + dev_sts); > + return -ENODEV; > + } > + > + aq->depth = EFA_ADMIN_QUEUE_DEPTH; > + > + aq->dmadev = edev->dmadev; > + set_bit(EFA_AQ_STATE_POLLING_BIT, &aq->state); > + > + sema_init(&aq->avail_cmds, aq->depth); > + spin_lock_init(&aq->stats_lock); > + > + err = efa_com_init_comp_ctxt(aq); > + if (err) > + return err; > + > + err = efa_com_admin_init_sq(edev); > + if (err) > + goto err_destroy_comp_ctxt; > + > + err = efa_com_admin_init_cq(edev); > + if (err) > + goto err_destroy_sq; > + > + efa_com_set_admin_polling_mode(edev, false); > + > + err = efa_com_admin_init_aenq(edev, aenq_handlers); > + if (err) > + goto err_destroy_cq; > + > + cap = efa_com_reg_read32(edev, EFA_REGS_CAPS_OFF); > + timeout = (cap & EFA_REGS_CAPS_ADMIN_CMD_TO_MASK) >> > + EFA_REGS_CAPS_ADMIN_CMD_TO_SHIFT; > + if (timeout) > + /* the resolution of timeout reg is 100ms */ > + aq->completion_timeout = timeout * 100000; > + else > + aq->completion_timeout = ADMIN_CMD_TIMEOUT_US; > + > + aq->poll_interval = EFA_POLL_INTERVAL_MS; > + > + set_bit(EFA_AQ_STATE_RUNNING_BIT, &aq->state); > + > + return 0; > + > +err_destroy_cq: > + dma_free_coherent(edev->dmadev, ADMIN_CQ_SIZE(aq->depth), > + aq->cq.entries, aq->cq.dma_addr); > +err_destroy_sq: > + dma_free_coherent(edev->dmadev, ADMIN_SQ_SIZE(aq->depth), > + aq->sq.entries, aq->sq.dma_addr); > +err_destroy_comp_ctxt: > + 
devm_kfree(edev->dmadev, aq->comp_ctx); > + > + return err; > +} > + > +/* > + * efa_com_admin_q_comp_intr_handler - admin queue interrupt handler > + * @edev: EFA communication layer struct > + * > + * This method go over the admin completion queue and wake up all the pending > + * threads that wait on the commands wait event. > + * > + * @note: Should be called after MSI-X interrupt. > + */ > +void efa_com_admin_q_comp_intr_handler(struct efa_com_dev *edev) > +{ > + unsigned long flags; > + > + spin_lock_irqsave(&edev->aq.cq.lock, flags); > + efa_com_handle_admin_completion(&edev->aq); > + spin_unlock_irqrestore(&edev->aq.cq.lock, flags); > +} > + > +/* > + * efa_handle_specific_aenq_event: > + * return the handler that is relevant to the specific event group > + */ > +static efa_aenq_handler efa_com_get_specific_aenq_cb(struct efa_com_dev *edev, > + u16 group) > +{ > + struct efa_aenq_handlers *aenq_handlers = edev->aenq.aenq_handlers; > + > + if (group < EFA_MAX_HANDLERS && aenq_handlers->handlers[group]) > + return aenq_handlers->handlers[group]; > + > + return aenq_handlers->unimplemented_handler; > +} > + > +/* > + * efa_com_aenq_intr_handler - AENQ interrupt handler > + * @edev: EFA communication layer struct > + * > + * Go over the async event notification queue and call the proper aenq handler. > + */ > +void efa_com_aenq_intr_handler(struct efa_com_dev *edev, void *data) > +{ > + struct efa_admin_aenq_common_desc *aenq_common; > + struct efa_com_aenq *aenq = &edev->aenq; > + struct efa_admin_aenq_entry *aenq_e; > + efa_aenq_handler handler_cb; > + u32 processed = 0; > + u8 phase; > + u32 ci; > + > + ci = aenq->cc & (aenq->depth - 1); > + phase = aenq->phase; > + aenq_e = &aenq->entries[ci]; /* Get first entry */ > + aenq_common = &aenq_e->aenq_common_desc; > + > + /* Go over all the events */ > + while ((READ_ONCE(aenq_common->flags) & > + EFA_ADMIN_AENQ_COMMON_DESC_PHASE_MASK) == phase) { > + /* > + * Do not read the rest of the completion entry before the > + * phase bit was validated > + */ > + dma_rmb(); > + > + /* Handle specific event*/ > + handler_cb = efa_com_get_specific_aenq_cb(edev, > + aenq_common->group); > + handler_cb(data, aenq_e); /* call the actual event handler*/ > + > + /* Get next event entry */ > + ci++; > + processed++; > + > + if (unlikely(ci == aenq->depth)) { > + ci = 0; > + phase = !phase; > + } > + aenq_e = &aenq->entries[ci]; > + aenq_common = &aenq_e->aenq_common_desc; > + } > + > + aenq->cc += processed; > + aenq->phase = phase; > + > + /* Don't update aenq doorbell if there weren't any processed events */ > + if (!processed) > + return; > + > + /* barrier not needed in case of writel */ > + writel(aenq->cc, edev->reg_bar + EFA_REGS_AENQ_CONS_DB_OFF); > +} > + > +static void efa_com_mmio_reg_read_resp_addr_init(struct efa_com_dev *edev) > +{ > + struct efa_com_mmio_read *mmio_read = &edev->mmio_read; > + u32 addr_high; > + u32 addr_low; > + > + /* dma_addr_bits is unknown at this point */ > + addr_high = (mmio_read->read_resp_dma_addr >> 32) & GENMASK(31, 0); > + addr_low = mmio_read->read_resp_dma_addr & GENMASK(31, 0); > + > + writel(addr_high, edev->reg_bar + EFA_REGS_MMIO_RESP_HI_OFF); > + writel(addr_low, edev->reg_bar + EFA_REGS_MMIO_RESP_LO_OFF); > +} > + > +int efa_com_mmio_reg_read_init(struct efa_com_dev *edev) > +{ > + struct efa_com_mmio_read *mmio_read = &edev->mmio_read; > + > + spin_lock_init(&mmio_read->lock); > + mmio_read->read_resp = > + dma_alloc_coherent(edev->dmadev, sizeof(*mmio_read->read_resp), > + 
&mmio_read->read_resp_dma_addr, GFP_KERNEL); > + if (unlikely(!mmio_read->read_resp)) > + return -ENOMEM; > + > + efa_com_mmio_reg_read_resp_addr_init(edev); > + > + mmio_read->read_resp->req_id = 0; > + mmio_read->seq_num = 0; > + mmio_read->mmio_read_timeout = EFA_REG_READ_TIMEOUT_US; > + > + return 0; > +} > + > +void efa_com_mmio_reg_read_destroy(struct efa_com_dev *edev) > +{ > + struct efa_com_mmio_read *mmio_read = &edev->mmio_read; > + > + /* just in case someone is still spinning on a read */ > + spin_lock(&mmio_read->lock); > + dma_free_coherent(edev->dmadev, sizeof(*mmio_read->read_resp), > + mmio_read->read_resp, mmio_read->read_resp_dma_addr); > + spin_unlock(&mmio_read->lock); > +} If someone can be spinning on a read, then can they actually try to grab this lock -after- the lock is grabbed here? If so, then when the read thread does acquire the lock, the read will be accessing freed memory. > + > +/* > + * efa_com_validate_version - Validate the device parameters > + * @edev: EFA communication layer struct > + * > + * This method validate the device parameters are the same as the saved > + * parameters in edev. > + * This method is useful after device reset, to validate the device mac address > + * and the device offloads are the same as before the reset. > + * > + * @return - 0 on success negative value otherwise. > + */ > +int efa_com_validate_version(struct efa_com_dev *edev) > +{ > + u32 ctrl_ver_masked; > + u32 ctrl_ver; > + u32 ver; > + > + /* > + * Make sure the EFA version and the controller version are at least > + * as the driver expects > + */ > + ver = efa_com_reg_read32(edev, EFA_REGS_VERSION_OFF); > + ctrl_ver = efa_com_reg_read32(edev, > + EFA_REGS_CONTROLLER_VERSION_OFF); > + > + efa_info(edev->dmadev, > + "efa device version: %d.%d\n", > + (ver & EFA_REGS_VERSION_MAJOR_VERSION_MASK) >> > + EFA_REGS_VERSION_MAJOR_VERSION_SHIFT, > + ver & EFA_REGS_VERSION_MINOR_VERSION_MASK); > + > + if (ver < MIN_EFA_VER) { > + efa_err(edev->dmadev, > + "EFA version is lower than the minimal version the driver supports\n"); > + return -EOPNOTSUPP; > + } > + > + efa_info(edev->dmadev, > + "efa controller version: %d.%d.%d implementation version %d\n", > + (ctrl_ver & EFA_REGS_CONTROLLER_VERSION_MAJOR_VERSION_MASK) > + >> EFA_REGS_CONTROLLER_VERSION_MAJOR_VERSION_SHIFT, > + (ctrl_ver & EFA_REGS_CONTROLLER_VERSION_MINOR_VERSION_MASK) > + >> EFA_REGS_CONTROLLER_VERSION_MINOR_VERSION_SHIFT, > + (ctrl_ver & EFA_REGS_CONTROLLER_VERSION_SUBMINOR_VERSION_MASK), > + (ctrl_ver & EFA_REGS_CONTROLLER_VERSION_IMPL_ID_MASK) >> > + EFA_REGS_CONTROLLER_VERSION_IMPL_ID_SHIFT); > + > + ctrl_ver_masked = > + (ctrl_ver & EFA_REGS_CONTROLLER_VERSION_MAJOR_VERSION_MASK) | > + (ctrl_ver & EFA_REGS_CONTROLLER_VERSION_MINOR_VERSION_MASK) | > + (ctrl_ver & EFA_REGS_CONTROLLER_VERSION_SUBMINOR_VERSION_MASK); > + > + /* Validate the ctrl version without the implementation ID */ > + if (ctrl_ver_masked < MIN_EFA_CTRL_VER) { > + efa_err(edev->dmadev, > + "EFA ctrl version is lower than the minimal ctrl version the driver supports\n"); > + return -EOPNOTSUPP; > + } > + > + return 0; > +} > + > +/* > + * efa_com_get_dma_width - Retrieve physical dma address width the device > + * supports. > + * @edev: EFA communication layer struct > + * > + * Retrieve the maximum physical address bits the device can handle. > + * > + * @return: > 0 on Success and negative value otherwise. 
> + */ > +int efa_com_get_dma_width(struct efa_com_dev *edev) > +{ > + u32 caps = efa_com_reg_read32(edev, EFA_REGS_CAPS_OFF); > + int width; > + > + width = (caps & EFA_REGS_CAPS_DMA_ADDR_WIDTH_MASK) >> > + EFA_REGS_CAPS_DMA_ADDR_WIDTH_SHIFT; > + > + efa_dbg(edev->dmadev, "DMA width: %d\n", width); > + > + if (width < 32 || width > 64) { > + efa_err(edev->dmadev, "DMA width illegal value: %d\n", width); > + return -EINVAL; > + } > + > + edev->dma_addr_bits = width; > + > + return width; > +} > + > +static int wait_for_reset_state(struct efa_com_dev *edev, u32 timeout, > + u16 exp_state) > +{ > + u32 val, i; > + > + for (i = 0; i < timeout; i++) { > + val = efa_com_reg_read32(edev, EFA_REGS_DEV_STS_OFF); > + > + if ((val & EFA_REGS_DEV_STS_RESET_IN_PROGRESS_MASK) == > + exp_state) > + return 0; > + > + efa_dbg(edev->dmadev, "Reset indication val %d\n", val); > + msleep(EFA_POLL_INTERVAL_MS); > + } > + > + return -ETIME; > +} > + > +/* > + * efa_com_dev_reset - Perform device FLR to the device. > + * @edev: EFA communication layer struct > + * @reset_reason: Specify what is the trigger for the reset in case of an error. > + * > + * @return - 0 on success, negative value on failure. > + */ > +int efa_com_dev_reset(struct efa_com_dev *edev, > + enum efa_regs_reset_reason_types reset_reason) > +{ > + u32 stat, timeout, cap, reset_val; > + int err; > + > + stat = efa_com_reg_read32(edev, EFA_REGS_DEV_STS_OFF); > + cap = efa_com_reg_read32(edev, EFA_REGS_CAPS_OFF); > + > + if (!(stat & EFA_REGS_DEV_STS_READY_MASK)) { > + efa_err(edev->dmadev, "Device isn't ready, can't reset device\n"); > + return -EINVAL; > + } > + > + timeout = (cap & EFA_REGS_CAPS_RESET_TIMEOUT_MASK) >> > + EFA_REGS_CAPS_RESET_TIMEOUT_SHIFT; > + if (!timeout) { > + efa_err(edev->dmadev, "Invalid timeout value\n"); > + return -EINVAL; > + } > + > + /* start reset */ > + reset_val = EFA_REGS_DEV_CTL_DEV_RESET_MASK; > + reset_val |= (reset_reason << EFA_REGS_DEV_CTL_RESET_REASON_SHIFT) & > + EFA_REGS_DEV_CTL_RESET_REASON_MASK; > + writel(reset_val, edev->reg_bar + EFA_REGS_DEV_CTL_OFF); > + > + /* reset clears the mmio readless address, restore it */ > + efa_com_mmio_reg_read_resp_addr_init(edev); > + > + err = wait_for_reset_state(edev, timeout, > + EFA_REGS_DEV_STS_RESET_IN_PROGRESS_MASK); > + if (err) { > + efa_err(edev->dmadev, "Reset indication didn't turn on\n"); > + return err; > + } > + > + /* reset done */ > + writel(0, edev->reg_bar + EFA_REGS_DEV_CTL_OFF); > + err = wait_for_reset_state(edev, timeout, 0); > + if (err) { > + efa_err(edev->dmadev, "Reset indication didn't turn off\n"); > + return err; > + } > + > + timeout = (cap & EFA_REGS_CAPS_ADMIN_CMD_TO_MASK) >> > + EFA_REGS_CAPS_ADMIN_CMD_TO_SHIFT; > + if (timeout) > + /* the resolution of timeout reg is 100ms */ > + edev->aq.completion_timeout = timeout * 100000; > + else > + edev->aq.completion_timeout = ADMIN_CMD_TIMEOUT_US; > + > + return 0; > +} Reviewed-by: Steve Wise <swise@xxxxxxxxxxxxxxxxxxxxx>
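P.S. To make the efa_com_mmio_reg_read_destroy() locking concern above concrete: if a reader really can still be spinning at teardown time, one (untested) way to close the window would be to invalidate the pointer under the lock and free outside it, with the reader bailing out on NULL:

	void efa_com_mmio_reg_read_destroy(struct efa_com_dev *edev)
	{
		struct efa_com_mmio_read *mmio_read = &edev->mmio_read;
		struct efa_admin_mmio_req_read_less_resp *read_resp;

		spin_lock(&mmio_read->lock);
		read_resp = mmio_read->read_resp;
		mmio_read->read_resp = NULL;
		spin_unlock(&mmio_read->lock);

		dma_free_coherent(edev->dmadev, sizeof(*read_resp),
				  read_resp, mmio_read->read_resp_dma_addr);
	}

and in efa_com_reg_read32(), right after taking the lock:

	if (!mmio_read->read_resp) {
		spin_unlock(&mmio_read->lock);
		return EFA_MMIO_READ_INVALID;
	}

Of course, if teardown is already serialized against readers at a higher level, none of this is needed.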