On 2024/12/17 0:53, Manivannan Sadhasivam wrote:
> [...]
>
>> +/*
>> + * PCI EPF driver private data.
>> + */
>> +struct nvmet_pciep_epf {
>
> nvmet_pci_epf?
>
>> +        struct pci_epf *epf;
>> +
>> +        const struct pci_epc_features *epc_features;
>> +
>> +        void *reg_bar;
>> +        size_t msix_table_offset;
>> +
>> +        unsigned int irq_type;
>> +        unsigned int nr_vectors;
>> +
>> +        struct nvmet_pciep_ctrl ctrl;
>> +
>> +        struct dma_chan *dma_tx_chan;
>> +        struct mutex dma_tx_lock;
>> +        struct dma_chan *dma_rx_chan;
>> +        struct mutex dma_rx_lock;
>> +
>> +        struct mutex mmio_lock;
>> +
>> +        /* PCI endpoint function configfs attributes */
>> +        struct config_group group;
>> +        bool dma_enable;
>> +        __le16 portid;
>> +        char subsysnqn[NVMF_NQN_SIZE];
>> +        unsigned int mdts_kb;
>> +};
>> +
>> +static inline u32 nvmet_pciep_bar_read32(struct nvmet_pciep_ctrl *ctrl, u32 off)
>> +{
>> +        __le32 *bar_reg = ctrl->bar + off;
>> +
>> +        return le32_to_cpu(READ_ONCE(*bar_reg));
>
> Looks like you can use readl/writel variants here. Any reason to not use them?

The BAR memory comes from dma_alloc_coherent(), which gives a "void *"
pointer and *not* iomem. So using readl/writel to access that memory seems
awfully wrong to me.

>> +static bool nvmet_pciep_epf_init_dma(struct nvmet_pciep_epf *nvme_epf)
>> +{
>> +        struct pci_epf *epf = nvme_epf->epf;
>> +        struct device *dev = &epf->dev;
>> +        struct nvmet_pciep_epf_dma_filter filter;
>> +        struct dma_chan *chan;
>> +        dma_cap_mask_t mask;
>> +
>> +        mutex_init(&nvme_epf->dma_rx_lock);
>> +        mutex_init(&nvme_epf->dma_tx_lock);
>> +
>> +        dma_cap_zero(mask);
>> +        dma_cap_set(DMA_SLAVE, mask);
>> +
>> +        filter.dev = epf->epc->dev.parent;
>> +        filter.dma_mask = BIT(DMA_DEV_TO_MEM);
>> +
>> +        chan = dma_request_channel(mask, nvmet_pciep_epf_dma_filter, &filter);
>> +        if (!chan)
>> +                return false;
>
> You should also destroy mutexes in error path.

Good catch. Will add that.
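Something like this (untested sketch; the tx channel request is elided from
the quote above, so I am writing that part from memory):

static bool nvmet_pciep_epf_init_dma(struct nvmet_pciep_epf *nvme_epf)
{
        struct pci_epf *epf = nvme_epf->epf;
        struct nvmet_pciep_epf_dma_filter filter;
        struct dma_chan *chan;
        dma_cap_mask_t mask;

        mutex_init(&nvme_epf->dma_rx_lock);
        mutex_init(&nvme_epf->dma_tx_lock);

        dma_cap_zero(mask);
        dma_cap_set(DMA_SLAVE, mask);

        filter.dev = epf->epc->dev.parent;

        /* RX channel: transfers from the host (device) to local memory */
        filter.dma_mask = BIT(DMA_DEV_TO_MEM);
        chan = dma_request_channel(mask, nvmet_pciep_epf_dma_filter, &filter);
        if (!chan)
                goto out_destroy_mutexes;
        nvme_epf->dma_rx_chan = chan;

        /* TX channel: transfers from local memory to the host (device) */
        filter.dma_mask = BIT(DMA_MEM_TO_DEV);
        chan = dma_request_channel(mask, nvmet_pciep_epf_dma_filter, &filter);
        if (!chan)
                goto out_release_rx;
        nvme_epf->dma_tx_chan = chan;

        return true;

out_release_rx:
        dma_release_channel(nvme_epf->dma_rx_chan);
out_destroy_mutexes:
        /* Destroy the mutexes initialized at the top of the function */
        mutex_destroy(&nvme_epf->dma_rx_lock);
        mutex_destroy(&nvme_epf->dma_tx_lock);
        return false;
}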
[...]

>> +static int nvmet_pciep_epf_dma_transfer(struct nvmet_pciep_epf *nvme_epf,
>> +                struct nvmet_pciep_segment *seg, enum dma_data_direction dir)
>> +{
>> +        struct pci_epf *epf = nvme_epf->epf;
>> +        struct dma_async_tx_descriptor *desc;
>> +        struct dma_slave_config sconf = {};
>> +        struct device *dev = &epf->dev;
>> +        struct device *dma_dev;
>> +        struct dma_chan *chan;
>> +        dma_cookie_t cookie;
>> +        dma_addr_t dma_addr;
>> +        struct mutex *lock;
>> +        int ret;
>> +
>> +        switch (dir) {
>> +        case DMA_FROM_DEVICE:
>> +                lock = &nvme_epf->dma_rx_lock;
>> +                chan = nvme_epf->dma_rx_chan;
>> +                sconf.direction = DMA_DEV_TO_MEM;
>> +                sconf.src_addr = seg->pci_addr;
>> +                break;
>> +        case DMA_TO_DEVICE:
>> +                lock = &nvme_epf->dma_tx_lock;
>> +                chan = nvme_epf->dma_tx_chan;
>> +                sconf.direction = DMA_MEM_TO_DEV;
>> +                sconf.dst_addr = seg->pci_addr;
>> +                break;
>> +        default:
>> +                return -EINVAL;
>> +        }
>> +
>> +        mutex_lock(lock);
>> +
>> +        dma_dev = dmaengine_get_dma_device(chan);
>> +        dma_addr = dma_map_single(dma_dev, seg->buf, seg->length, dir);
>> +        ret = dma_mapping_error(dma_dev, dma_addr);
>> +        if (ret)
>> +                goto unlock;
>> +
>> +        ret = dmaengine_slave_config(chan, &sconf);
>> +        if (ret) {
>> +                dev_err(dev, "Failed to configure DMA channel\n");
>> +                goto unmap;
>> +        }
>> +
>> +        desc = dmaengine_prep_slave_single(chan, dma_addr, seg->length,
>> +                                           sconf.direction, DMA_CTRL_ACK);
>> +        if (!desc) {
>> +                dev_err(dev, "Failed to prepare DMA\n");
>> +                ret = -EIO;
>> +                goto unmap;
>> +        }
>> +
>> +        cookie = dmaengine_submit(desc);
>> +        ret = dma_submit_error(cookie);
>> +        if (ret) {
>> +                dev_err(dev, "DMA submit failed %d\n", ret);
>> +                goto unmap;
>> +        }
>> +
>> +        if (dma_sync_wait(chan, cookie) != DMA_COMPLETE) {
>
> Why do you need to do a sync transfer all the time? This defeats the purpose
> of using DMA.

You may be getting confused about the meaning of "sync" here. This simply
means "spin waiting for the completion of the transfer". I was initially
using a completion callback, like pci-epf-test and the other EPF drivers
doing DMA, but that causes a context switch for every transfer, even small
ones, which really hurts performance. Switching to dma_sync_wait() avoids
this context switch while achieving what we need, which is to wait for the
end of the transfer. I nearly doubled the max IOPS for small IOs with this.

Note that we cannot easily do asynchronous DMA transfers with the current
DMA slave API, unless we create some special work item responsible for the
DMA transfers. I did not try to micro-optimize this driver too much for this
first drop. Any improvement around how data transfers are done can come
later. The performance I am getting with this code is decent enough as it is.

[...]

>> +static inline int nvmet_pciep_epf_transfer(struct nvmet_pciep_epf *nvme_epf,
>
> No need to add 'inline' keyword in .c files.

Yes there is: that function is tiny and I really want it to be inlined.

[...]

>> +invalid_offset:
>> +        dev_err(ctrl->dev, "PRPs list invalid offset\n");
>> +        kfree(prps);
>> +        iod->status = NVME_SC_PRP_INVALID_OFFSET | NVME_STATUS_DNR;
>> +        return -EINVAL;
>> +
>> +invalid_field:
>> +        dev_err(ctrl->dev, "PRPs list invalid field\n");
>> +        kfree(prps);
>> +        iod->status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
>> +        return -EINVAL;
>> +
>> +internal:
>> +        dev_err(ctrl->dev, "PRPs list internal error\n");
>> +        kfree(prps);
>> +        iod->status = NVME_SC_INTERNAL | NVME_STATUS_DNR;
>> +        return -EINVAL;
>
> Can't you organize the labels in such a way that there is only one return
> path? Current code makes it difficult to read and also would confuse the
> static checkers.

I do not see how this is difficult to read... But sure, I can try to
simplify this.
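Something like this maybe, setting the status code in each error branch and
funneling everything through a single error label (sketch only; the "status"
local variable and the conditions shown are placeholders):

        u16 status;

        ...

        if (!prp_list_offset_is_valid) {        /* placeholder condition */
                dev_err(ctrl->dev, "PRPs list invalid offset\n");
                status = NVME_SC_PRP_INVALID_OFFSET;
                goto err;
        }

        if (!prp_list_field_is_valid) {         /* placeholder condition */
                dev_err(ctrl->dev, "PRPs list invalid field\n");
                status = NVME_SC_INVALID_FIELD;
                goto err;
        }

        ...

err:
        kfree(prps);
        iod->status = status | NVME_STATUS_DNR;
        return -EINVAL;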
> [...]
>
>> +static void nvmet_pciep_init_bar(struct nvmet_pciep_ctrl *ctrl)
>> +{
>> +        struct nvmet_ctrl *tctrl = ctrl->tctrl;
>> +
>> +        ctrl->bar = ctrl->nvme_epf->reg_bar;
>> +
>> +        /* Copy the target controller capabilities as a base */
>> +        ctrl->cap = tctrl->cap;
>> +
>> +        /* Contiguous Queues Required (CQR) */
>> +        ctrl->cap |= 0x1ULL << 16;
>> +
>> +        /* Set Doorbell stride to 4B (DSTRD) */
>> +        ctrl->cap &= ~GENMASK(35, 32);
>> +
>> +        /* Clear NVM Subsystem Reset Supported (NSSRS) */
>> +        ctrl->cap &= ~(0x1ULL << 36);
>> +
>> +        /* Clear Boot Partition Support (BPS) */
>> +        ctrl->cap &= ~(0x1ULL << 45);
>> +
>> +        /* Clear Persistent Memory Region Supported (PMRS) */
>> +        ctrl->cap &= ~(0x1ULL << 56);
>> +
>> +        /* Clear Controller Memory Buffer Supported (CMBS) */
>> +        ctrl->cap &= ~(0x1ULL << 57);
>
> Can you use macros for these?

I could. But that would mean defining *all* the bits of the CAP register,
which is a lot... The NVMe code does not have all of these defined now. That
is a cleanup that can come later I think.
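If someone does that cleanup, I would expect it to look something like this
(the NVME_CAP_* names below are made up; nothing like this exists in
include/linux/nvme.h today):

/* Made-up names for the NVMe CAP register fields used above */
#define NVME_CAP_CQR    BIT_ULL(16)             /* Contiguous Queues Required */
#define NVME_CAP_DSTRD  GENMASK_ULL(35, 32)     /* Doorbell Stride */
#define NVME_CAP_NSSRS  BIT_ULL(36)             /* NVM Subsystem Reset Supported */
#define NVME_CAP_BPS    BIT_ULL(45)             /* Boot Partition Support */
#define NVME_CAP_PMRS   BIT_ULL(56)             /* Persistent Memory Region Supported */
#define NVME_CAP_CMBS   BIT_ULL(57)             /* Controller Memory Buffer Supported */

        ctrl->cap |= NVME_CAP_CQR;
        /* A DSTRD value of 0 selects the minimum 4B doorbell stride */
        ctrl->cap &= ~(NVME_CAP_DSTRD | NVME_CAP_NSSRS | NVME_CAP_BPS |
                       NVME_CAP_PMRS | NVME_CAP_CMBS);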
[...]

>> +        if (epc_features->msix_capable) {
>> +                size_t pba_size;
>> +
>> +                msix_table_size = PCI_MSIX_ENTRY_SIZE * epf->msix_interrupts;
>> +                nvme_epf->msix_table_offset = reg_size;
>> +                pba_size = ALIGN(DIV_ROUND_UP(epf->msix_interrupts, 8), 8);
>> +
>> +                reg_size += msix_table_size + pba_size;
>> +        }
>> +
>> +        reg_bar_size = ALIGN(reg_size, max(epc_features->align, 4096));
>
> Where does this 4K alignment come from? NVMe spec? If so, is it OK to use a
> fixed_size BAR?

The NVMe BAR cannot be fixed size as its size depends on the number of queue
pairs the controller can support. I will check if the 4K alignment is
mandated by the specs. But it sure does not hurt...

[...]

>> +        /* Create the target controller. */
>> +        ret = nvmet_pciep_create_ctrl(nvme_epf, max_nr_queues);
>> +        if (ret) {
>> +                dev_err(&epf->dev,
>> +                        "Create NVMe PCI target controller failed\n");
>
> Failed to create NVMe PCI target controller

How is that better?

>
>> +                return ret;
>> +        }
>> +
>> +        if (epf->vfunc_no <= 1) {
>
> Are you really supporting virtual functions? If supported, 'vfunc_no < 1' is
> not possible.

NVMe does support virtual functions and nothing in the configuration path
prevents the user from setting up one such function. So yes, this is
supposed to work if the endpoint controller supports it. Though I must say
that I have not tried/tested this yet.

>
>> +                /* Set device ID, class, etc */
>> +                epf->header->vendorid = ctrl->tctrl->subsys->vendor_id;
>> +                epf->header->subsys_vendor_id =
>> +                        ctrl->tctrl->subsys->subsys_vendor_id;
>
> Why are these coming from somewhere else and not configured within the EPF
> driver?

They are set through the nvme target configfs, so there is no need to have
these set up again through the epf configfs. We just grab the values set for
the NVMe target subsystem config.

>> +static int nvmet_pciep_epf_link_up(struct pci_epf *epf)
>> +{
>> +        struct nvmet_pciep_epf *nvme_epf = epf_get_drvdata(epf);
>> +        struct nvmet_pciep_ctrl *ctrl = &nvme_epf->ctrl;
>> +
>> +        dev_info(nvme_epf->ctrl.dev, "PCI link up\n");
>
> These prints are supposed to come from the controller drivers. So no need to
> have them here also.

Nope, the controller driver does not print anything. At least the DWC driver
does not print anything.

>> +static ssize_t nvmet_pciep_epf_dma_enable_store(struct config_item *item,
>> +                const char *page, size_t len)
>> +{
>> +        struct config_group *group = to_config_group(item);
>> +        struct nvmet_pciep_epf *nvme_epf = to_nvme_epf(group);
>> +        int ret;
>> +
>> +        if (nvme_epf->ctrl.tctrl)
>> +                return -EBUSY;
>> +
>> +        ret = kstrtobool(page, &nvme_epf->dma_enable);
>> +        if (ret)
>> +                return ret;
>> +
>> +        return len;
>> +}
>> +
>> +CONFIGFS_ATTR(nvmet_pciep_epf_, dma_enable);
>
> What is the use of making this option user configurable? It is purely a
> hardware capability and I don't think users would want to have their NVMe
> device working without DMA voluntarily.

If you feel strongly about it, I can drop this. But it is useful for
debugging purposes, to check that the DMA API is being used and working
correctly.

>> +static int __init nvmet_pciep_init_module(void)
>> +{
>> +        int ret;
>> +
>> +        ret = pci_epf_register_driver(&nvmet_pciep_epf_driver);
>> +        if (ret)
>> +                return ret;
>> +
>> +        ret = nvmet_register_transport(&nvmet_pciep_fabrics_ops);
>
> What is the need to register the transport so early? You should consider
> moving the registration to bind() so that the transport can be removed once
> the driver is unbound from the controller.

This registration is needed to enable the configfs configuration of the nvme
target subsystem and port. Without it, setting a port transport type to
"pci" would fail to find the pci transport implemented here and the
configuration would fail. This configuration of the nvme target subsystem
and port must come before configuring the EPF and binding it.

-- 
Damien Le Moal
Western Digital Research