On Thu, 22 Aug 2024 11:37:18 -0700 mhkelley58@xxxxxxxxx wrote:

> From: Michael Kelley <mhklinux@xxxxxxxxxxx>
>
> In a CoCo VM, all DMA-based I/O must use swiotlb bounce buffers
> because DMA cannot be done to private (encrypted) portions of VM
> memory. The bounce buffer memory is marked shared (decrypted) at
> boot time, so I/O is done to/from the bounce buffer memory and then
> copied by the CPU to/from the final target memory (i.e., "bounced").
> Storage devices can be large consumers of bounce buffer memory because
> it is possible to have large numbers of I/Os in flight across multiple
> devices. Bounce buffer memory must be pre-allocated at boot time, and
> it is difficult to know how much memory to allocate to handle peak
> storage I/O loads. Consequently, bounce buffer memory is typically
> over-provisioned, which wastes memory, and may still not avoid a peak
> that exhausts bounce buffer memory and causes storage I/O errors.
>
> For CoCo VMs running with NVMe PCI devices, update the driver to
> permit bounce buffer throttling. Gate the throttling behavior
> on a DMA layer check indicating that throttling is useful, so that
> no change occurs in a non-CoCo VM. If throttling is useful, enable
> the BLK_MQ_F_BLOCKING flag, and pass the DMA_ATTR_MAY_BLOCK attribute
> into dma_map_bvec() and dma_map_sgtable() calls. With these options in
> place, DMA map requests are pended when necessary to reduce the
> likelihood of usage peaks caused by the NVMe driver that could exhaust
> bounce buffer memory and generate errors.
>
> Signed-off-by: Michael Kelley <mhklinux@xxxxxxxxxxx>

LGTM.

Reviewed-by: Petr Tesarik <ptesarik@xxxxxxxx>

Petr T

> ---
>  drivers/nvme/host/pci.c | 18 ++++++++++++++----
>  1 file changed, 14 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index 6cd9395ba9ec..2c39943a87f8 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -156,6 +156,7 @@ struct nvme_dev {
>  	dma_addr_t host_mem_descs_dma;
>  	struct nvme_host_mem_buf_desc *host_mem_descs;
>  	void **host_mem_desc_bufs;
> +	unsigned long dma_attrs;
>  	unsigned int nr_allocated_queues;
>  	unsigned int nr_write_queues;
>  	unsigned int nr_poll_queues;
> @@ -735,7 +736,8 @@ static blk_status_t nvme_setup_prp_simple(struct nvme_dev *dev,
>  	unsigned int offset = bv->bv_offset & (NVME_CTRL_PAGE_SIZE - 1);
>  	unsigned int first_prp_len = NVME_CTRL_PAGE_SIZE - offset;
>
> -	iod->first_dma = dma_map_bvec(dev->dev, bv, rq_dma_dir(req), 0);
> +	iod->first_dma = dma_map_bvec(dev->dev, bv, rq_dma_dir(req),
> +				      dev->dma_attrs);
>  	if (dma_mapping_error(dev->dev, iod->first_dma))
>  		return BLK_STS_RESOURCE;
>  	iod->dma_len = bv->bv_len;
> @@ -754,7 +756,8 @@ static blk_status_t nvme_setup_sgl_simple(struct nvme_dev *dev,
>  {
>  	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
>
> -	iod->first_dma = dma_map_bvec(dev->dev, bv, rq_dma_dir(req), 0);
> +	iod->first_dma = dma_map_bvec(dev->dev, bv, rq_dma_dir(req),
> +				      dev->dma_attrs);
>  	if (dma_mapping_error(dev->dev, iod->first_dma))
>  		return BLK_STS_RESOURCE;
>  	iod->dma_len = bv->bv_len;
> @@ -800,7 +803,7 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
>  		goto out_free_sg;
>
>  	rc = dma_map_sgtable(dev->dev, &iod->sgt, rq_dma_dir(req),
> -			     DMA_ATTR_NO_WARN);
> +			     dev->dma_attrs | DMA_ATTR_NO_WARN);
>  	if (rc) {
>  		if (rc == -EREMOTEIO)
>  			ret = BLK_STS_TARGET;
> @@ -828,7 +831,8 @@ static blk_status_t nvme_map_metadata(struct nvme_dev *dev, struct request *req,
>  	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
>  	struct bio_vec bv = rq_integrity_vec(req);
>
> -	iod->meta_dma = dma_map_bvec(dev->dev, &bv, rq_dma_dir(req), 0);
> +	iod->meta_dma = dma_map_bvec(dev->dev, &bv, rq_dma_dir(req),
> +				     dev->dma_attrs);
>  	if (dma_mapping_error(dev->dev, iod->meta_dma))
>  		return BLK_STS_IOERR;
>  	cmnd->rw.metadata = cpu_to_le64(iod->meta_dma);
> @@ -3040,6 +3044,12 @@ static struct nvme_dev *nvme_pci_alloc_dev(struct pci_dev *pdev,
>  	 * a single integrity segment for the separate metadata pointer.
>  	 */
>  	dev->ctrl.max_integrity_segments = 1;
> +
> +	if (dma_recommend_may_block(dev->dev)) {
> +		dev->ctrl.blocking = true;
> +		dev->dma_attrs = DMA_ATTR_MAY_BLOCK;
> +	}
> +
>  	return dev;
>
>  out_put_device:
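
For readers outside the NVMe code, the shape of the change generalizes to
any block driver. Below is a minimal sketch of that pattern, assuming the
dma_recommend_may_block() helper and DMA_ATTR_MAY_BLOCK attribute added
earlier in this series (neither is in mainline); the "foo" driver names
are hypothetical.

	/*
	 * Sketch of the throttling opt-in pattern used by this patch,
	 * for a hypothetical driver "foo". dma_recommend_may_block()
	 * and DMA_ATTR_MAY_BLOCK come from earlier patches in this
	 * series.
	 */
	#include <linux/blk-mq.h>
	#include <linux/bvec.h>
	#include <linux/dma-mapping.h>

	struct foo_dev {
		struct device *dev;
		unsigned long dma_attrs;	/* 0, or DMA_ATTR_MAY_BLOCK */
	};

	static void foo_setup_dma_attrs(struct foo_dev *fd)
	{
		/*
		 * Ask the DMA layer whether blocking on bounce buffer
		 * throttling is worthwhile; it says yes only when
		 * swiotlb bounce buffering is in use, as in a CoCo VM.
		 * Opting in means every dma_map_*() call passing the
		 * attribute may sleep, so the request queue must be
		 * created with BLK_MQ_F_BLOCKING.
		 */
		if (dma_recommend_may_block(fd->dev))
			fd->dma_attrs = DMA_ATTR_MAY_BLOCK;
	}

	static dma_addr_t foo_map_bvec(struct foo_dev *fd, struct bio_vec *bv,
				       enum dma_data_direction dir)
	{
		/* May block under bounce-buffer pressure when throttled. */
		return dma_map_bvec(fd->dev, bv, dir, fd->dma_attrs);
	}

Caching the attribute once at probe time keeps the I/O hot path free of
extra checks: in a non-CoCo VM the helper declines, dma_attrs stays 0, and
every mapping call behaves exactly as before. The sleepable-context
obligation is why the patch also sets dev->ctrl.blocking, which per the
commit message leads to the queues being set up with BLK_MQ_F_BLOCKING.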