From: Michael Kelley <mhklinux@xxxxxxxxxxx>

In a CoCo VM, all DMA-based I/O must use swiotlb bounce buffers
because DMA cannot be done to private (encrypted) portions of VM
memory. The bounce buffer memory is marked shared (decrypted) at
boot time, so I/O is done to/from the bounce buffer memory and then
copied by the CPU to/from the final target memory (i.e., "bounced").
Storage devices can be large consumers of bounce buffer memory because
it is possible to have large numbers of I/Os in flight across multiple
devices. Bounce buffer memory must be pre-allocated at boot time, and
it is difficult to know how much memory to allocate to handle peak
storage I/O loads. Consequently, bounce buffer memory is typically
over-provisioned, which wastes memory, and may still not avoid a peak
that exhausts bounce buffer memory and causes storage I/O errors.

For CoCo VMs running with NVMe PCI devices, update the driver to
permit bounce buffer throttling. Gate the throttling behavior on a
DMA layer check indicating that throttling is useful, so that no
change occurs in a non-CoCo VM. If throttling is useful, enable the
BLK_MQ_F_BLOCKING flag, and pass the DMA_ATTR_MAY_BLOCK attribute
into dma_map_bvec() and dma_map_sgtable() calls. With these options
in place, DMA map requests are delayed when necessary, reducing the
likelihood of usage peaks caused by the NVMe driver that could
exhaust bounce buffer memory and generate errors.

Signed-off-by: Michael Kelley <mhklinux@xxxxxxxxxxx>
---
 drivers/nvme/host/pci.c | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 6cd9395ba9ec..2c39943a87f8 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -156,6 +156,7 @@ struct nvme_dev {
 	dma_addr_t host_mem_descs_dma;
 	struct nvme_host_mem_buf_desc *host_mem_descs;
 	void **host_mem_desc_bufs;
+	unsigned long dma_attrs;
 	unsigned int nr_allocated_queues;
 	unsigned int nr_write_queues;
 	unsigned int nr_poll_queues;
@@ -735,7 +736,8 @@ static blk_status_t nvme_setup_prp_simple(struct nvme_dev *dev,
 	unsigned int offset = bv->bv_offset & (NVME_CTRL_PAGE_SIZE - 1);
 	unsigned int first_prp_len = NVME_CTRL_PAGE_SIZE - offset;
 
-	iod->first_dma = dma_map_bvec(dev->dev, bv, rq_dma_dir(req), 0);
+	iod->first_dma = dma_map_bvec(dev->dev, bv, rq_dma_dir(req),
+					dev->dma_attrs);
 	if (dma_mapping_error(dev->dev, iod->first_dma))
 		return BLK_STS_RESOURCE;
 	iod->dma_len = bv->bv_len;
@@ -754,7 +756,8 @@ static blk_status_t nvme_setup_sgl_simple(struct nvme_dev *dev,
 {
 	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
 
-	iod->first_dma = dma_map_bvec(dev->dev, bv, rq_dma_dir(req), 0);
+	iod->first_dma = dma_map_bvec(dev->dev, bv, rq_dma_dir(req),
+					dev->dma_attrs);
 	if (dma_mapping_error(dev->dev, iod->first_dma))
 		return BLK_STS_RESOURCE;
 	iod->dma_len = bv->bv_len;
@@ -800,7 +803,7 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
 		goto out_free_sg;
 
 	rc = dma_map_sgtable(dev->dev, &iod->sgt, rq_dma_dir(req),
-			     DMA_ATTR_NO_WARN);
+			     dev->dma_attrs | DMA_ATTR_NO_WARN);
 	if (rc) {
 		if (rc == -EREMOTEIO)
 			ret = BLK_STS_TARGET;
@@ -828,7 +831,8 @@ static blk_status_t nvme_map_metadata(struct nvme_dev *dev, struct request *req,
 	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
 	struct bio_vec bv = rq_integrity_vec(req);
 
-	iod->meta_dma = dma_map_bvec(dev->dev, &bv, rq_dma_dir(req), 0);
+	iod->meta_dma = dma_map_bvec(dev->dev, &bv, rq_dma_dir(req),
+					dev->dma_attrs);
 	if (dma_mapping_error(dev->dev, iod->meta_dma))
 		return BLK_STS_IOERR;
 	cmnd->rw.metadata = cpu_to_le64(iod->meta_dma);
@@ -3040,6 +3044,12 @@ static struct nvme_dev *nvme_pci_alloc_dev(struct pci_dev *pdev,
 	 * a single integrity segment for the separate metadata pointer.
 	 */
 	dev->ctrl.max_integrity_segments = 1;
+
+	if (dma_recommend_may_block(dev->dev)) {
+		dev->ctrl.blocking = true;
+		dev->dma_attrs = DMA_ATTR_MAY_BLOCK;
+	}
+
 	return dev;
 
 out_put_device:
-- 
2.25.1
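
For reviewers new to the series, the shape of the change is as
follows. This is a minimal sketch, not the patch itself:
dma_recommend_may_block() and DMA_ATTR_MAY_BLOCK come from earlier
patches in this series (they are not in mainline), and the foo_*
names are hypothetical.

/*
 * Sketch of the gating pattern, for illustration only.
 * dma_recommend_may_block() and DMA_ATTR_MAY_BLOCK are introduced by
 * earlier patches in this series; the foo_* names are hypothetical.
 */
#include <linux/blk-mq.h>
#include <linux/bvec.h>
#include <linux/dma-mapping.h>

struct foo_dev {
	struct device *dev;		/* underlying PCI device */
	unsigned long dma_attrs;	/* 0, or DMA_ATTR_MAY_BLOCK */
};

/* Decide once, at controller allocation time, whether maps may block. */
static void foo_init_dma_attrs(struct foo_dev *fdev,
			       struct blk_mq_tag_set *set)
{
	fdev->dma_attrs = 0;
	if (dma_recommend_may_block(fdev->dev)) {
		/*
		 * A blocking dma_map_*() call must run in process
		 * context, so the queues must be BLK_MQ_F_BLOCKING.
		 */
		set->flags |= BLK_MQ_F_BLOCKING;
		fdev->dma_attrs = DMA_ATTR_MAY_BLOCK;
	}
}

/* The I/O path then forwards the stashed attrs into every mapping. */
static dma_addr_t foo_map_bvec(struct foo_dev *fdev, struct bio_vec *bv,
			       enum dma_data_direction dir)
{
	return dma_map_bvec(fdev->dev, bv, dir, fdev->dma_attrs);
}

The decision is made once, at allocation time: in a non-CoCo VM the
recommendation check fails, dma_attrs stays 0, and every mapping call
behaves exactly as before, so the hot path carries no new
conditionals. In the patch itself the blk-mq flag is requested
indirectly, via dev->ctrl.blocking, so that the NVMe core applies
BLK_MQ_F_BLOCKING when it allocates the tag set, rather than the
driver setting the flag directly as the sketch above does.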