On Tue, 4 Apr 2023 12:01:39 -0700 Brett Creeley <brett.creeley@xxxxxxx> wrote: > +static int > +pds_vfio_dirty_enable(struct pds_vfio_pci_device *pds_vfio, > + struct rb_root_cached *ranges, u32 nnodes, > + u64 *page_size) > +{ > + struct pds_vfio_dirty *dirty = &pds_vfio->dirty; > + u64 region_start, region_size, region_page_size; > + struct pds_lm_dirty_region_info *region_info; > + struct interval_tree_node *node = NULL; > + struct pci_dev *pdev = pds_vfio->pdev; > + u8 max_regions = 0, num_regions; > + dma_addr_t regions_dma = 0; > + u32 num_ranges = nnodes; > + u32 page_count; > + u16 len; > + int err; > + > + dev_dbg(&pdev->dev, "vf%u: Start dirty page tracking\n", pds_vfio->vf_id); > + > + if (pds_vfio_dirty_is_enabled(pds_vfio)) > + return -EINVAL; > + > + pds_vfio_dirty_set_enabled(pds_vfio); > + > + /* find if dirty tracking is disabled, i.e. num_regions == 0 */ > + err = pds_vfio_dirty_status_cmd(pds_vfio, 0, &max_regions, &num_regions); > + if (num_regions) { > + dev_err(&pdev->dev, "Dirty tracking already enabled for %d regions\n", > + num_regions); > + err = -EEXIST; > + goto err_out; > + } else if (!max_regions) { > + dev_err(&pdev->dev, "Device doesn't support dirty tracking, max_regions %d\n", > + max_regions); > + err = -EOPNOTSUPP; > + goto err_out; > + } else if (err) { > + dev_err(&pdev->dev, "Failed to get dirty status, err %pe\n", > + ERR_PTR(err)); > + goto err_out; > + } > + > + /* Only support 1 region for now. If there are any large gaps in the > + * VM's address regions, then this would be a waste of memory as we are > + * generating 2 bitmaps (ack/seq) from the min address to the max > + * address of the VM's address regions. In the future, if we support > + * more than one region in the device/driver we can split the bitmaps > + * on the largest address region gaps. We can do this split up to the > + * max_regions times returned from the dirty_status command. 
> + */ Large gaps in a VM's address space are very possible, particularly after QEMU has relocated RAM above 4GB to above the reserved HyperTransport range on AMD systems. Thanks, Alex > + max_regions = 1; > + if (num_ranges > max_regions) { > + vfio_combine_iova_ranges(ranges, nnodes, max_regions); > + num_ranges = max_regions; > + } > + > + node = interval_tree_iter_first(ranges, 0, ULONG_MAX); > + if (!node) { > + err = -EINVAL; > + goto err_out; > + } > + > + region_size = node->last - node->start + 1; > + region_start = node->start; > + region_page_size = *page_size; > + > + len = sizeof(*region_info); > + region_info = kzalloc(len, GFP_KERNEL); > + if (!region_info) { > + err = -ENOMEM; > + goto err_out; > + } > + > + page_count = DIV_ROUND_UP(region_size, region_page_size); > + > + region_info->dma_base = cpu_to_le64(region_start); > + region_info->page_count = cpu_to_le32(page_count); > + region_info->page_size_log2 = ilog2(region_page_size); > + > + regions_dma = dma_map_single(pds_vfio->coredev, (void *)region_info, len, > + DMA_BIDIRECTIONAL); > + if (dma_mapping_error(pds_vfio->coredev, regions_dma)) { > + err = -ENOMEM; > + kfree(region_info); > + goto err_out; > + } > + > + err = pds_vfio_dirty_enable_cmd(pds_vfio, regions_dma, max_regions); > + dma_unmap_single(pds_vfio->coredev, regions_dma, len, DMA_BIDIRECTIONAL); > + /* page_count might be adjusted by the device, > + * update it before freeing region_info DMA > + */ > + page_count = le32_to_cpu(region_info->page_count); > + > + dev_dbg(&pdev->dev, "region_info: regions_dma 0x%llx dma_base 0x%llx page_count %u page_size_log2 %u\n", > + regions_dma, region_start, page_count, (u8)ilog2(region_page_size)); > + > + kfree(region_info); > + if (err) > + goto err_out; > + > + err = pds_vfio_dirty_alloc_bitmaps(dirty, page_count); > + if (err) { > + dev_err(&pdev->dev, "Failed to alloc dirty bitmaps: %pe\n", > + ERR_PTR(err)); > + goto err_out; > + } > + > + err = pds_vfio_dirty_alloc_sgl(pds_vfio, page_count); > + if
(err) { > + dev_err(&pdev->dev, "Failed to alloc dirty sg lists: %pe\n", > + ERR_PTR(err)); > + goto err_free_bitmaps; > + } > + > + dirty->region_start = region_start; > + dirty->region_size = region_size; > + dirty->region_page_size = region_page_size; > + > + pds_vfio_print_guest_region_info(pds_vfio, max_regions); > + > + return 0; > + > +err_free_bitmaps: > + pds_vfio_dirty_free_bitmaps(dirty); > +err_out: > + pds_vfio_dirty_set_disabled(pds_vfio); > + return err; > +}