* Mpt3sas driver uses the NVMe Encapsulated Request message to send an NVMe command to an NVMe device attached to the IOC. * Normal I/O commands like reads and writes are passed to the controller as SCSI commands and the controller has the ability to translate the commands to NVMe equivalent. * This encapsulated NVMe command is used by applications to send direct NVMe commands to NVMe drives. Signed-off-by: Chaitra P B <chaitra.basappa@xxxxxxxxxxxx> Signed-off-by: Suganath Prabu S <suganath-prabu.subramani@xxxxxxxxxxxx> --- drivers/scsi/mpt3sas/mpt3sas_base.c | 276 +++++++++++++++++++++++++++++++++++- drivers/scsi/mpt3sas/mpt3sas_base.h | 4 + drivers/scsi/mpt3sas/mpt3sas_ctl.c | 69 ++++++++- 3 files changed, 342 insertions(+), 7 deletions(-) diff --git a/drivers/scsi/mpt3sas/mpt3sas_base.c b/drivers/scsi/mpt3sas/mpt3sas_base.c index 1ad3cbb..b0a75c6 100644 --- a/drivers/scsi/mpt3sas/mpt3sas_base.c +++ b/drivers/scsi/mpt3sas/mpt3sas_base.c @@ -557,6 +557,11 @@ _base_sas_ioc_info(struct MPT3SAS_ADAPTER *ioc, MPI2DefaultReply_t *mpi_reply, frame_sz = sizeof(Mpi2SmpPassthroughRequest_t) + ioc->sge_size; func_str = "smp_passthru"; break; + case MPI2_FUNCTION_NVME_ENCAPSULATED: + frame_sz = sizeof(Mpi26NVMeEncapsulatedRequest_t) + + ioc->sge_size; + func_str = "nvme_encapsulated"; + break; default: frame_sz = 32; func_str = "unknown"; @@ -985,7 +990,9 @@ _base_interrupt(int irq, void *bus_id) if (request_desript_type == MPI25_RPY_DESCRIPT_FLAGS_FAST_PATH_SCSI_IO_SUCCESS || request_desript_type == - MPI2_RPY_DESCRIPT_FLAGS_SCSI_IO_SUCCESS) { + MPI2_RPY_DESCRIPT_FLAGS_SCSI_IO_SUCCESS || + request_desript_type == + MPI26_RPY_DESCRIPT_FLAGS_PCIE_ENCAPSULATED_SUCCESS) { cb_idx = _base_get_cb_idx(ioc, smid); if ((likely(cb_idx < MPT_MAX_CALLBACKS)) && (likely(mpt_callbacks[cb_idx] != NULL))) { @@ -1345,6 +1352,225 @@ _base_build_sg(struct MPT3SAS_ADAPTER *ioc, void *psge, } } +/* IEEE format sgls */ + +/** + * _base_build_nvme_prp - This function is called for NVMe end devices to build + * a native SGL (NVMe PRP). The native SGL is built starting in the first PRP + * entry of the NVMe message (PRP1). If the data buffer is small enough to be + * described entirely using PRP1, then PRP2 is not used. If needed, PRP2 is + * used to describe a larger data buffer. If the data buffer is too large to + * describe using the two PRP entriess inside the NVMe message, then PRP1 + * describes the first data memory segment, and PRP2 contains a pointer to a PRP + * list located elsewhere in memory to describe the remaining data memory + * segments. The PRP list will be contiguous. + + * The native SGL for NVMe devices is a Physical Region Page (PRP). A PRP + * consists of a list of PRP entries to describe a number of noncontigous + * physical memory segments as a single memory buffer, just as a SGL does. Note + * however, that this function is only used by the IOCTL call, so the memory + * given will be guaranteed to be contiguous. There is no need to translate + * non-contiguous SGL into a PRP in this case. All PRPs will describe + * contiguous space that is one page size each. + * + * Each NVMe message contains two PRP entries. The first (PRP1) either contains + * a PRP list pointer or a PRP element, depending upon the command. PRP2 + * contains the second PRP element if the memory being described fits within 2 + * PRP entries, or a PRP list pointer if the PRP spans more than two entries. + * + * A PRP list pointer contains the address of a PRP list, structured as a linear + * array of PRP entries. Each PRP entry in this list describes a segment of + * physical memory. + * + * Each 64-bit PRP entry comprises an address and an offset field. The address + * always points at the beginning of a 4KB physical memory page, and the offset + * describes where within that 4KB page the memory segment begins. Only the + * first element in a PRP list may contain a non-zero offest, implying that all + * memory segments following the first begin at the start of a 4KB page. + * + * Each PRP element normally describes 4KB of physical memory, with exceptions + * for the first and last elements in the list. If the memory being described + * by the list begins at a non-zero offset within the first 4KB page, then the + * first PRP element will contain a non-zero offset indicating where the region + * begins within the 4KB page. The last memory segment may end before the end + * of the 4KB segment, depending upon the overall size of the memory being + * described by the PRP list. + * + * Since PRP entries lack any indication of size, the overall data buffer length + * is used to determine where the end of the data memory buffer is located, and + * how many PRP entries are required to describe it. + * + * @ioc: per adapter object + * @smid: system request message index for getting asscociated SGL + * @nvme_encap_request: the NVMe request msg frame pointer + * @data_out_dma: physical address for WRITES + * @data_out_sz: data xfer size for WRITES + * @data_in_dma: physical address for READS + * @data_in_sz: data xfer size for READS + * + * Returns nothing. + */ +static void +_base_build_nvme_prp(struct MPT3SAS_ADAPTER *ioc, u16 smid, + Mpi26NVMeEncapsulatedRequest_t *nvme_encap_request, + dma_addr_t data_out_dma, size_t data_out_sz, dma_addr_t data_in_dma, + size_t data_in_sz) +{ + int prp_size = NVME_PRP_SIZE; + u64 *prp_entry, *prp1_entry, *prp2_entry, *prp_entry_phys; + u64 *prp_page, *prp_page_phys; + u32 offset, entry_len; + u32 page_mask_result, page_mask; + dma_addr_t paddr; + size_t length; + + /* + * Not all commands require a data transfer. If no data, just return + * without constructing any PRP. + */ + if (!data_in_sz && !data_out_sz) + return; + /* + * Set pointers to PRP1 and PRP2, which are in the NVMe command. + * PRP1 is located at a 24 byte offset from the start of the NVMe + * command. Then set the current PRP entry pointer to PRP1. + */ + prp1_entry = (u64 *)(nvme_encap_request->NVMe_Command + + NVME_CMD_PRP1_OFFSET); + prp2_entry = (u64 *)(nvme_encap_request->NVMe_Command + + NVME_CMD_PRP2_OFFSET); + prp_entry = prp1_entry; + /* + * For the PRP entries, use the specially allocated buffer of + * contiguous memory. + */ + prp_page = (u64 *)mpt3sas_base_get_pcie_sgl(ioc, smid); + prp_page_phys = (u64 *)mpt3sas_base_get_pcie_sgl_dma(ioc, smid); + + /* + * Check if we are within 1 entry of a page boundary we don't + * want our first entry to be a PRP List entry. + */ + page_mask = ioc->page_size - 1; + page_mask_result = (uintptr_t)((u8 *)prp_page + prp_size) & page_mask; + if (!page_mask_result) { + /* Bump up to next page boundary. */ + prp_page = (u64 *)((u8 *)prp_page + prp_size); + prp_page_phys = (u64 *)((u8 *)prp_page_phys + prp_size); + } + + /* + * Set PRP physical pointer, which initially points to the current PRP + * DMA memory page. + */ + prp_entry_phys = prp_page_phys; + + /* Get physical address and length of the data buffer. */ + if (data_in_sz) { + paddr = data_in_dma; + length = data_in_sz; + } else { + paddr = data_out_dma; + length = data_out_sz; + } + + /* Loop while the length is not zero. */ + while (length) { + /* + * Check if we need to put a list pointer here if we are at + * page boundary - prp_size (8 bytes). + */ + page_mask_result = + (uintptr_t)((u8 *)prp_entry_phys + prp_size) & page_mask; + if (!page_mask_result) { + /* + * This is the last entry in a PRP List, so we need to + * put a PRP list pointer here. What this does is: + * - bump the current memory pointer to the next + * address, which will be the next full page. + * - set the PRP Entry to point to that page. This + * is now the PRP List pointer. + * - bump the PRP Entry pointer the start of the + * next page. Since all of this PRP memory is + * contiguous, no need to get a new page - it's + * just the next address. + */ + prp_entry_phys++; + *prp_entry = cpu_to_le64((uintptr_t)prp_entry_phys); + prp_entry++; + } + + /* Need to handle if entry will be part of a page. */ + offset = (u32)paddr & page_mask; + entry_len = ioc->page_size - offset; + + if (prp_entry == prp1_entry) { + /* + * Must fill in the first PRP pointer (PRP1) before + * moving on. + */ + *prp1_entry = cpu_to_le64((u64)paddr); + + /* + * Now point to the second PRP entry within the + * command (PRP2). + */ + prp_entry = prp2_entry; + } else if (prp_entry == prp2_entry) { + /* + * Should the PRP2 entry be a PRP List pointer or just + * a regular PRP pointer? If there is more than one + * more page of data, must use a PRP List pointer. + */ + if (length > ioc->page_size) { + /* + * PRP2 will contain a PRP List pointer because + * more PRP's are needed with this command. The + * list will start at the beginning of the + * contiguous buffer. + */ + *prp2_entry = + cpu_to_le64((uintptr_t)prp_entry_phys); + + /* + * The next PRP Entry will be the start of the + * first PRP List. + */ + prp_entry = prp_page; + } else { + /* + * After this, the PRP Entries are complete. + * This command uses 2 PRP's and no PRP list. + */ + *prp2_entry = cpu_to_le64((u64)paddr); + } + } else { + /* + * Put entry in list and bump the addresses. + * + * After PRP1 and PRP2 are filled in, this will fill in + * all remaining PRP entries in a PRP List, one per + * each time through the loop. + */ + *prp_entry = cpu_to_le64((u64)paddr); + prp_entry++; + prp_entry_phys++; + } + + /* + * Bump the phys address of the command's data buffer by the + * entry_len. + */ + paddr += entry_len; + + /* Decrement length accounting for last partial page. */ + if (entry_len > length) + length = 0; + else + length -= entry_len; + } +} + /** * base_make_prp_nvme - * Prepare PRPs(Physical Region Page)- SGLs specific to NVMe drives only @@ -2794,6 +3020,30 @@ _base_put_smid_hi_priority(struct MPT3SAS_ADAPTER *ioc, u16 smid, } /** + * _base_put_smid_nvme_encap - send NVMe encapsulated request to + * firmware + * @ioc: per adapter object + * @smid: system request message index + * + * Return nothing. + */ +static void +_base_put_smid_nvme_encap(struct MPT3SAS_ADAPTER *ioc, u16 smid) +{ + Mpi2RequestDescriptorUnion_t descriptor; + u64 *request = (u64 *)&descriptor; + + descriptor.Default.RequestFlags = + MPI26_REQ_DESCRIPT_FLAGS_PCIE_ENCAPSULATED; + descriptor.Default.MSIxIndex = _base_get_msix_index(ioc); + descriptor.Default.SMID = cpu_to_le16(smid); + descriptor.Default.LMID = 0; + descriptor.Default.DescriptorTypeDependent = 0; + _base_writeq(*request, &ioc->chip->RequestDescriptorPostLow, + &ioc->scsi_lookup_lock); +} + +/** * _base_put_smid_default - Default, primarily used for config pages * @ioc: per adapter object * @smid: system request message index @@ -2884,6 +3134,27 @@ _base_put_smid_hi_priority_atomic(struct MPT3SAS_ADAPTER *ioc, u16 smid, } /** + * _base_put_smid_nvme_encap_atomic - send NVMe encapsulated request to + * firmware using Atomic Request Descriptor + * @ioc: per adapter object + * @smid: system request message index + * + * Return nothing. + */ +static void +_base_put_smid_nvme_encap_atomic(struct MPT3SAS_ADAPTER *ioc, u16 smid) +{ + Mpi26AtomicRequestDescriptor_t descriptor; + u32 *request = (u32 *)&descriptor; + + descriptor.RequestFlags = MPI26_REQ_DESCRIPT_FLAGS_PCIE_ENCAPSULATED; + descriptor.MSIxIndex = _base_get_msix_index(ioc); + descriptor.SMID = cpu_to_le16(smid); + + writel(cpu_to_le32(*request), &ioc->chip->AtomicRequestDescriptorPost); +} + +/** * _base_put_smid_default - Default, primarily used for config pages * use Atomic Request Descriptor * @ioc: per adapter object @@ -5707,6 +5978,7 @@ mpt3sas_base_attach(struct MPT3SAS_ADAPTER *ioc) */ ioc->build_sg_scmd = &_base_build_sg_scmd_ieee; ioc->build_sg = &_base_build_sg_ieee; + ioc->build_nvme_prp = &_base_build_nvme_prp; ioc->build_zero_len_sge = &_base_build_zero_len_sge_ieee; ioc->sge_size_ieee = sizeof(Mpi2IeeeSgeSimple64_t); @@ -5718,11 +5990,13 @@ mpt3sas_base_attach(struct MPT3SAS_ADAPTER *ioc) ioc->put_smid_scsi_io = &_base_put_smid_scsi_io_atomic; ioc->put_smid_fast_path = &_base_put_smid_fast_path_atomic; ioc->put_smid_hi_priority = &_base_put_smid_hi_priority_atomic; + ioc->put_smid_nvme_encap = &_base_put_smid_nvme_encap_atomic; } else { ioc->put_smid_default = &_base_put_smid_default; ioc->put_smid_scsi_io = &_base_put_smid_scsi_io; ioc->put_smid_fast_path = &_base_put_smid_fast_path; ioc->put_smid_hi_priority = &_base_put_smid_hi_priority; + ioc->put_smid_nvme_encap = &_base_put_smid_nvme_encap; } diff --git a/drivers/scsi/mpt3sas/mpt3sas_base.h b/drivers/scsi/mpt3sas/mpt3sas_base.h index 4758729..034b34d 100644 --- a/drivers/scsi/mpt3sas/mpt3sas_base.h +++ b/drivers/scsi/mpt3sas/mpt3sas_base.h @@ -1184,6 +1184,9 @@ struct MPT3SAS_ADAPTER { MPT_BUILD_SG build_sg_mpi; MPT_BUILD_ZERO_LEN_SGE build_zero_len_sge_mpi; + /* function ptr for NVMe PRP elements only */ + NVME_BUILD_PRP build_nvme_prp; + /* event log */ u32 event_type[MPI2_EVENT_NOTIFY_EVENTMASK_WORDS]; u32 event_context; @@ -1354,6 +1357,7 @@ struct MPT3SAS_ADAPTER { PUT_SMID_IO_FP_HIP put_smid_fast_path; PUT_SMID_IO_FP_HIP put_smid_hi_priority; PUT_SMID_DEFAULT put_smid_default; + PUT_SMID_DEFAULT put_smid_nvme_encap; }; diff --git a/drivers/scsi/mpt3sas/mpt3sas_ctl.c b/drivers/scsi/mpt3sas/mpt3sas_ctl.c index 67c7280..205c443 100644 --- a/drivers/scsi/mpt3sas/mpt3sas_ctl.c +++ b/drivers/scsi/mpt3sas/mpt3sas_ctl.c @@ -272,6 +272,7 @@ mpt3sas_ctl_done(struct MPT3SAS_ADAPTER *ioc, u16 smid, u8 msix_index, { MPI2DefaultReply_t *mpi_reply; Mpi2SCSIIOReply_t *scsiio_reply; + Mpi26NVMeEncapsulatedErrorReply_t *nvme_error_reply; const void *sense_data; u32 sz; @@ -298,6 +299,18 @@ mpt3sas_ctl_done(struct MPT3SAS_ADAPTER *ioc, u16 smid, u8 msix_index, memcpy(ioc->ctl_cmds.sense, sense_data, sz); } } + /* + * Get Error Response data for NVMe device. The ctl_cmds.sense + * buffer is used to store the Error Response data. + */ + if (mpi_reply->Function == MPI2_FUNCTION_NVME_ENCAPSULATED) { + nvme_error_reply = + (Mpi26NVMeEncapsulatedErrorReply_t *)mpi_reply; + sz = min_t(u32, NVME_ERROR_RESPONSE_SIZE, + le32_to_cpu(nvme_error_reply->ErrorResponseCount)); + sense_data = mpt3sas_base_get_sense_buffer(ioc, smid); + memcpy(ioc->ctl_cmds.sense, sense_data, sz); + } } _ctl_display_some_debug(ioc, smid, "ctl_done", mpi_reply); @@ -641,11 +654,12 @@ _ctl_do_mpt_command(struct MPT3SAS_ADAPTER *ioc, struct mpt3_ioctl_command karg, { MPI2RequestHeader_t *mpi_request = NULL, *request; MPI2DefaultReply_t *mpi_reply; + Mpi26NVMeEncapsulatedRequest_t *nvme_encap_request = NULL; u32 ioc_state; u16 smid; unsigned long timeout; u8 issue_reset; - u32 sz; + u32 sz, sz_arg; void *psge; void *data_out = NULL; dma_addr_t data_out_dma = 0; @@ -742,7 +756,8 @@ _ctl_do_mpt_command(struct MPT3SAS_ADAPTER *ioc, struct mpt3_ioctl_command karg, if (mpi_request->Function == MPI2_FUNCTION_SCSI_IO_REQUEST || mpi_request->Function == MPI2_FUNCTION_RAID_SCSI_IO_PASSTHROUGH || mpi_request->Function == MPI2_FUNCTION_SCSI_TASK_MGMT || - mpi_request->Function == MPI2_FUNCTION_SATA_PASSTHROUGH) { + mpi_request->Function == MPI2_FUNCTION_SATA_PASSTHROUGH || + mpi_request->Function == MPI2_FUNCTION_NVME_ENCAPSULATED) { device_handle = le16_to_cpu(mpi_request->FunctionDependent1); if (!device_handle || (device_handle > @@ -793,6 +808,38 @@ _ctl_do_mpt_command(struct MPT3SAS_ADAPTER *ioc, struct mpt3_ioctl_command karg, init_completion(&ioc->ctl_cmds.done); switch (mpi_request->Function) { + case MPI2_FUNCTION_NVME_ENCAPSULATED: + { + nvme_encap_request = (Mpi26NVMeEncapsulatedRequest_t *)request; + /* + * Get the Physical Address of the sense buffer. + * Use Error Response buffer address field to hold the sense + * buffer address. + * Clear the internal sense buffer, which will potentially hold + * the Completion Queue Entry on return, or 0 if no Entry. + * Build the PRPs and set direction bits. + * Send the request. + */ + nvme_encap_request->ErrorResponseBaseAddress = ioc->sense_dma & + 0xFFFFFFFF00000000; + nvme_encap_request->ErrorResponseBaseAddress |= + (U64)mpt3sas_base_get_sense_buffer_dma(ioc, smid); + nvme_encap_request->ErrorResponseAllocationLength = + NVME_ERROR_RESPONSE_SIZE; + memset(ioc->ctl_cmds.sense, 0, NVME_ERROR_RESPONSE_SIZE); + ioc->build_nvme_prp(ioc, smid, nvme_encap_request, + data_out_dma, data_out_sz, data_in_dma, data_in_sz); + if (test_bit(device_handle, ioc->device_remove_in_progress)) { + dtmprintk(ioc, pr_info(MPT3SAS_FMT "handle(0x%04x) :" + "ioctl failed due to device removal in progress\n", + ioc->name, device_handle)); + mpt3sas_base_free_smid(ioc, smid); + ret = -EINVAL; + goto out; + } + ioc->put_smid_nvme_encap(ioc, smid); + break; + } case MPI2_FUNCTION_SCSI_IO_REQUEST: case MPI2_FUNCTION_RAID_SCSI_IO_PASSTHROUGH: { @@ -1008,15 +1055,25 @@ _ctl_do_mpt_command(struct MPT3SAS_ADAPTER *ioc, struct mpt3_ioctl_command karg, } } - /* copy out sense to user */ + /* copy out sense/NVMe Error Response to user */ if (karg.max_sense_bytes && (mpi_request->Function == MPI2_FUNCTION_SCSI_IO_REQUEST || mpi_request->Function == - MPI2_FUNCTION_RAID_SCSI_IO_PASSTHROUGH)) { - sz = min_t(u32, karg.max_sense_bytes, SCSI_SENSE_BUFFERSIZE); + MPI2_FUNCTION_RAID_SCSI_IO_PASSTHROUGH || mpi_request->Function == + MPI2_FUNCTION_NVME_ENCAPSULATED)) { + if (karg.sense_data_ptr == NULL) { + pr_info(MPT3SAS_FMT "Response buffer provided" + " by application is NULL; Response data will" + " not be returned.\n", ioc->name); + goto out; + } + sz_arg = (mpi_request->Function == + MPI2_FUNCTION_NVME_ENCAPSULATED) ? NVME_ERROR_RESPONSE_SIZE : + SCSI_SENSE_BUFFERSIZE; + sz = min_t(u32, karg.max_sense_bytes, sz_arg); if (copy_to_user(karg.sense_data_ptr, ioc->ctl_cmds.sense, sz)) { pr_err("failure at %s:%d/%s()!\n", __FILE__, - __LINE__, __func__); + __LINE__, __func__); ret = -ENODATA; goto out; } -- 2.5.5