Re: [PATCH v13 vfio 5/7] vfio/pds: Add support for dirty page tracking

On 8/3/2023 5:43 AM, Shameerali Kolothum Thodi wrote:


Hi Brett,

-----Original Message-----
From: Brett Creeley [mailto:brett.creeley@xxxxxxx]
Sent: 25 July 2023 22:40
To: kvm@xxxxxxxxxxxxxxx; netdev@xxxxxxxxxxxxxxx;
alex.williamson@xxxxxxxxxx; jgg@xxxxxxxxxx; yishaih@xxxxxxxxxx;
Shameerali Kolothum Thodi <shameerali.kolothum.thodi@xxxxxxxxxx>;
kevin.tian@xxxxxxxxx
Cc: simon.horman@xxxxxxxxxxxx; brett.creeley@xxxxxxx;
shannon.nelson@xxxxxxx
Subject: [PATCH v13 vfio 5/7] vfio/pds: Add support for dirty page tracking

[...]

+static int pds_vfio_dirty_enable(struct pds_vfio_pci_device *pds_vfio,
+                              struct rb_root_cached *ranges, u32 nnodes,
+                              u64 *page_size)
+{
+     struct pci_dev *pdev = pds_vfio->vfio_coredev.pdev;
+     struct device *pdsc_dev = &pci_physfn(pdev)->dev;
+     struct pds_vfio_dirty *dirty = &pds_vfio->dirty;
+     u64 region_start, region_size, region_page_size;
+     struct pds_lm_dirty_region_info *region_info;
+     struct interval_tree_node *node = NULL;
+     u8 max_regions = 0, num_regions;
+     dma_addr_t regions_dma = 0;
+     u32 num_ranges = nnodes;
+     u32 page_count;
+     u16 len;
+     int err;
+
+     dev_dbg(&pdev->dev, "vf%u: Start dirty page tracking\n",
+             pds_vfio->vf_id);
+
+     if (pds_vfio_dirty_is_enabled(pds_vfio))
+             return -EINVAL;
+
+     pds_vfio_dirty_set_enabled(pds_vfio);

Any reason why this is set here? It looks to me like you could set this at the
end, once everything goes well, and avoid the goto out_set_disabled unwinds below.
Not sure if I am missing anything obvious here.

Thanks,
Shameer.

No, this is a good catch and simplifies this function a bit. I will respin and fix this.

Thanks for the review,

Brett
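
For reference, the reorder I have in mind is basically: keep the status checks,
allocations, and the dirty enable command as they are, unwind with the existing
gotos on failure, and only call pds_vfio_dirty_set_enabled() once everything has
succeeded, which lets the out_set_disabled label go away. A rough, self-contained
sketch of that pattern in plain user-space C (struct tracker and tracker_enable()
are made-up stand-ins for illustration, not the actual driver code):

#include <errno.h>
#include <stdbool.h>
#include <stdlib.h>

struct tracker {
	bool enabled;
	void *bitmap;
	void *sgl;
};

/*
 * Illustrative "enable" flow: do every allocation/step first, unwind with
 * gotos on failure, and only flip the enabled flag after everything has
 * succeeded, so no "set disabled again" unwind label is needed.
 */
static int tracker_enable(struct tracker *t, size_t bmp_bytes, size_t sgl_bytes)
{
	int err;

	if (t->enabled)
		return -EEXIST;

	t->bitmap = calloc(1, bmp_bytes);
	if (!t->bitmap)
		return -ENOMEM;

	t->sgl = calloc(1, sgl_bytes);
	if (!t->sgl) {
		err = -ENOMEM;
		goto out_free_bitmap;
	}

	t->enabled = true;	/* only after all of the setup above succeeded */

	return 0;

out_free_bitmap:
	free(t->bitmap);
	t->bitmap = NULL;
	return err;
}

int main(void)
{
	struct tracker t = { 0 };

	return tracker_enable(&t, 4096, 64) ? EXIT_FAILURE : EXIT_SUCCESS;
}

Applied to pds_vfio_dirty_enable(), that just means moving the set_enabled call
down next to where dirty->region_* is recorded.
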
+
+     /* find if dirty tracking is disabled, i.e. num_regions == 0 */
+     err = pds_vfio_dirty_status_cmd(pds_vfio, 0, &max_regions,
+                                     &num_regions);
+     if (err < 0) {
+             dev_err(&pdev->dev, "Failed to get dirty status, err %pe\n",
+                     ERR_PTR(err));
+             goto out_set_disabled;
+     } else if (num_regions) {
+             dev_err(&pdev->dev,
+                     "Dirty tracking already enabled for %d regions\n",
+                     num_regions);
+             err = -EEXIST;
+             goto out_set_disabled;
+     } else if (!max_regions) {
+             dev_err(&pdev->dev,
+                     "Device doesn't support dirty tracking, max_regions %d\n",
+                     max_regions);
+             err = -EOPNOTSUPP;
+             goto out_set_disabled;
+     }
+
+     /*
+      * Only support 1 region for now. If there are any large gaps in the
+      * VM's address regions, then this would be a waste of memory as we are
+      * generating 2 bitmaps (ack/seq) from the min address to the max
+      * address of the VM's address regions. In the future, if we support
+      * more than one region in the device/driver we can split the bitmaps
+      * on the largest address region gaps. We can do this split up to the
+      * max_regions times returned from the dirty_status command.
+      */
+     max_regions = 1;
+     if (num_ranges > max_regions) {
+             vfio_combine_iova_ranges(ranges, nnodes, max_regions);
+             num_ranges = max_regions;
+     }
+
+     node = interval_tree_iter_first(ranges, 0, ULONG_MAX);
+     if (!node) {
+             err = -EINVAL;
+             goto out_set_disabled;
+     }
+
+     region_size = node->last - node->start + 1;
+     region_start = node->start;
+     region_page_size = *page_size;
+
+     len = sizeof(*region_info);
+     region_info = kzalloc(len, GFP_KERNEL);
+     if (!region_info) {
+             err = -ENOMEM;
+             goto out_set_disabled;
+     }
+
+     page_count = DIV_ROUND_UP(region_size, region_page_size);
+
+     region_info->dma_base = cpu_to_le64(region_start);
+     region_info->page_count = cpu_to_le32(page_count);
+     region_info->page_size_log2 = ilog2(region_page_size);
+
+     regions_dma = dma_map_single(pdsc_dev, (void *)region_info, len,
+                                  DMA_BIDIRECTIONAL);
+     if (dma_mapping_error(pdsc_dev, regions_dma)) {
+             err = -ENOMEM;
+             goto out_free_region_info;
+     }
+
+     err = pds_vfio_dirty_enable_cmd(pds_vfio, regions_dma, max_regions);
+     dma_unmap_single(pdsc_dev, regions_dma, len, DMA_BIDIRECTIONAL);
+     if (err)
+             goto out_free_region_info;
+
+     /*
+      * page_count might be adjusted by the device,
+      * update it before freeing region_info DMA
+      */
+     page_count = le32_to_cpu(region_info->page_count);
+
+     dev_dbg(&pdev->dev,
+             "region_info: regions_dma 0x%llx dma_base 0x%llx page_count %u page_size_log2 %u\n",
+             regions_dma, region_start, page_count,
+             (u8)ilog2(region_page_size));
+
+     err = pds_vfio_dirty_alloc_bitmaps(dirty, page_count / BITS_PER_BYTE);
+     if (err) {
+             dev_err(&pdev->dev, "Failed to alloc dirty bitmaps: %pe\n",
+                     ERR_PTR(err));
+             goto out_free_region_info;
+     }
+
+     err = pds_vfio_dirty_alloc_sgl(pds_vfio, page_count);
+     if (err) {
+             dev_err(&pdev->dev, "Failed to alloc dirty sg lists: %pe\n",
+                     ERR_PTR(err));
+             goto out_free_bitmaps;
+     }
+
+     dirty->region_start = region_start;
+     dirty->region_size = region_size;
+     dirty->region_page_size = region_page_size;
+
+     pds_vfio_print_guest_region_info(pds_vfio, max_regions);
+
+     kfree(region_info);
+
+     return 0;
+
+out_free_bitmaps:
+     pds_vfio_dirty_free_bitmaps(dirty);
+out_free_region_info:
+     kfree(region_info);
+out_set_disabled:
+     pds_vfio_dirty_set_disabled(pds_vfio);
+     return err;
+}
+
+void pds_vfio_dirty_disable(struct pds_vfio_pci_device *pds_vfio, bool send_cmd)
+{
+     if (pds_vfio_dirty_is_enabled(pds_vfio)) {
+             pds_vfio_dirty_set_disabled(pds_vfio);
+             if (send_cmd)
+                     pds_vfio_dirty_disable_cmd(pds_vfio);
+             pds_vfio_dirty_free_sgl(pds_vfio);
+             pds_vfio_dirty_free_bitmaps(&pds_vfio->dirty);
+     }
+
+     if (send_cmd)
+                     pds_vfio_send_host_vf_lm_status_cmd(pds_vfio, PDS_LM_STA_NONE);
+}
+
+static int pds_vfio_dirty_seq_ack(struct pds_vfio_pci_device *pds_vfio,
+                               struct pds_vfio_bmp_info *bmp_info,
+                               u32 offset, u32 bmp_bytes, bool read_seq)
+{
+     const char *bmp_type_str = read_seq ? "read_seq" : "write_ack";
+     u8 dma_dir = read_seq ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
+     struct pci_dev *pdev = pds_vfio->vfio_coredev.pdev;
+     struct device *pdsc_dev = &pci_physfn(pdev)->dev;
+     unsigned long long npages;
+     struct sg_table sg_table;
+     struct scatterlist *sg;
+     struct page **pages;
+     u32 page_offset;
+     const void *bmp;
+     size_t size;
+     u16 num_sge;
+     int err;
+     int i;
+
+     bmp = (void *)((u64)bmp_info->bmp + offset);
+     page_offset = offset_in_page(bmp);
+     bmp -= page_offset;
+
+     /*
+      * Start and end of bitmap section to seq/ack might not be page
+      * aligned, so use the page_offset to account for that so there
+      * will be enough pages to represent the bmp_bytes
+      */
+     npages = DIV_ROUND_UP_ULL(bmp_bytes + page_offset, PAGE_SIZE);
+     pages = kmalloc_array(npages, sizeof(*pages), GFP_KERNEL);
+     if (!pages)
+             return -ENOMEM;
+
+     for (unsigned long long i = 0; i < npages; i++) {
+             struct page *page = vmalloc_to_page(bmp);
+
+             if (!page) {
+                     err = -EFAULT;
+                     goto out_free_pages;
+             }
+
+             pages[i] = page;
+             bmp += PAGE_SIZE;
+     }
+
+     err = sg_alloc_table_from_pages(&sg_table, pages, npages, page_offset,
+                                     bmp_bytes, GFP_KERNEL);
+     if (err)
+             goto out_free_pages;
+
+     err = dma_map_sgtable(pdsc_dev, &sg_table, dma_dir, 0);
+     if (err)
+             goto out_free_sg_table;
+
+     for_each_sgtable_dma_sg(&sg_table, sg, i) {
+             struct pds_lm_sg_elem *sg_elem = &bmp_info->sgl[i];
+
+             sg_elem->addr = cpu_to_le64(sg_dma_address(sg));
+             sg_elem->len = cpu_to_le32(sg_dma_len(sg));
+     }
+
+     num_sge = sg_table.nents;
+     size = num_sge * sizeof(struct pds_lm_sg_elem);
+     dma_sync_single_for_device(pdsc_dev, bmp_info->sgl_addr, size, dma_dir);
+     err = pds_vfio_dirty_seq_ack_cmd(pds_vfio, bmp_info->sgl_addr, num_sge,
+                                      offset, bmp_bytes, read_seq);
+     if (err)
+             dev_err(&pdev->dev,
+                     "Dirty bitmap %s failed offset %u bmp_bytes %u num_sge %u DMA 0x%llx: %pe\n",
+                     bmp_type_str, offset, bmp_bytes,
+                     num_sge, bmp_info->sgl_addr, ERR_PTR(err));
+     dma_sync_single_for_cpu(pdsc_dev, bmp_info->sgl_addr, size, dma_dir);
+
+     dma_unmap_sgtable(pdsc_dev, &sg_table, dma_dir, 0);
+out_free_sg_table:
+     sg_free_table(&sg_table);
+out_free_pages:
+     kfree(pages);
+
+     return err;
+}
+
+static int pds_vfio_dirty_write_ack(struct pds_vfio_pci_device *pds_vfio,
+                                 u32 offset, u32 len)
+{
+     return pds_vfio_dirty_seq_ack(pds_vfio, &pds_vfio->dirty.host_ack,
+                                   offset, len, WRITE_ACK);
+}
+
+static int pds_vfio_dirty_read_seq(struct pds_vfio_pci_device *pds_vfio,
+                                u32 offset, u32 len)
+{
+     return pds_vfio_dirty_seq_ack(pds_vfio, &pds_vfio->dirty.host_seq,
+                                   offset, len, READ_SEQ);
+}
+
+static int pds_vfio_dirty_process_bitmaps(struct pds_vfio_pci_device *pds_vfio,
+                                       struct iova_bitmap *dirty_bitmap,
+                                       u32 bmp_offset, u32 len_bytes)
+{
+     u64 page_size = pds_vfio->dirty.region_page_size;
+     u64 region_start = pds_vfio->dirty.region_start;
+     u32 bmp_offset_bit;
+     __le64 *seq, *ack;
+     int dword_count;
+
+     dword_count = len_bytes / sizeof(u64);
+     seq = (__le64 *)((u64)pds_vfio->dirty.host_seq.bmp + bmp_offset);
+     ack = (__le64 *)((u64)pds_vfio->dirty.host_ack.bmp + bmp_offset);
+     bmp_offset_bit = bmp_offset * 8;
+
+     for (int i = 0; i < dword_count; i++) {
+             u64 xor = le64_to_cpu(seq[i]) ^ le64_to_cpu(ack[i]);
+
+             /* prepare for next write_ack call */
+             ack[i] = seq[i];
+
+             for (u8 bit_i = 0; bit_i < BITS_PER_TYPE(u64); ++bit_i) {
+                     if (xor & BIT(bit_i)) {
+                             u64 abs_bit_i = bmp_offset_bit +
+                                             i * BITS_PER_TYPE(u64) + bit_i;
+                             u64 addr = abs_bit_i * page_size + region_start;
+
+                             iova_bitmap_set(dirty_bitmap, addr, page_size);
+                     }
+             }
+     }
+
+     return 0;
+}
+
+static int pds_vfio_dirty_sync(struct pds_vfio_pci_device *pds_vfio,
+                            struct iova_bitmap *dirty_bitmap,
+                            unsigned long iova, unsigned long length)
+{
+     struct device *dev = &pds_vfio->vfio_coredev.pdev->dev;
+     struct pds_vfio_dirty *dirty = &pds_vfio->dirty;
+     u64 bmp_offset, bmp_bytes;
+     u64 bitmap_size, pages;
+     int err;
+
+     dev_dbg(dev, "vf%u: Get dirty page bitmap\n", pds_vfio->vf_id);
+
+     if (!pds_vfio_dirty_is_enabled(pds_vfio)) {
+             dev_err(dev, "vf%u: Sync failed, dirty tracking is disabled\n",
+                     pds_vfio->vf_id);
+             return -EINVAL;
+     }
+
+     pages = DIV_ROUND_UP(length, pds_vfio->dirty.region_page_size);
+     bitmap_size =
+             round_up(pages, sizeof(u64) * BITS_PER_BYTE) / BITS_PER_BYTE;
+
+     dev_dbg(dev,
+             "vf%u: iova 0x%lx length %lu page_size %llu pages %llu bitmap_size %llu\n",
+             pds_vfio->vf_id, iova, length, pds_vfio->dirty.region_page_size,
+             pages, bitmap_size);
+
+     if (!length || ((dirty->region_start + iova + length) >
+                     (dirty->region_start + dirty->region_size))) {
+             dev_err(dev, "Invalid iova 0x%lx and/or length 0x%lx to sync\n",
+                     iova, length);
+             return -EINVAL;
+     }
+
+     /* bitmap is modified in 64 bit chunks */
+     bmp_bytes = ALIGN(DIV_ROUND_UP(length / dirty->region_page_size,
+                                    sizeof(u64)),
+                       sizeof(u64));
+     if (bmp_bytes != bitmap_size) {
+             dev_err(dev,
+                     "Calculated bitmap bytes %llu not equal to bitmap size %llu\n",
+                     bmp_bytes, bitmap_size);
+             return -EINVAL;
+     }
+
+     bmp_offset = DIV_ROUND_UP(iova / dirty->region_page_size, sizeof(u64));
+
+     dev_dbg(dev,
+             "Syncing dirty bitmap, iova 0x%lx length 0x%lx, bmp_offset %llu bmp_bytes %llu\n",
+             iova, length, bmp_offset, bmp_bytes);
+
+     err = pds_vfio_dirty_read_seq(pds_vfio, bmp_offset, bmp_bytes);
+     if (err)
+             return err;
+
+     err = pds_vfio_dirty_process_bitmaps(pds_vfio, dirty_bitmap, bmp_offset,
+                                          bmp_bytes);
+     if (err)
+             return err;
+
+     err = pds_vfio_dirty_write_ack(pds_vfio, bmp_offset, bmp_bytes);
+     if (err)
+             return err;
+
+     return 0;
+}
+
+int pds_vfio_dma_logging_report(struct vfio_device *vdev, unsigned long iova,
+                             unsigned long length, struct iova_bitmap *dirty)
+{
+     struct pds_vfio_pci_device *pds_vfio =
+             container_of(vdev, struct pds_vfio_pci_device,
+                          vfio_coredev.vdev);
+     int err;
+
+     mutex_lock(&pds_vfio->state_mutex);
+     err = pds_vfio_dirty_sync(pds_vfio, dirty, iova, length);
+     pds_vfio_state_mutex_unlock(pds_vfio);
+
+     return err;
+}
+
+int pds_vfio_dma_logging_start(struct vfio_device *vdev,
+                            struct rb_root_cached *ranges, u32 nnodes,
+                            u64 *page_size)
+{
+     struct pds_vfio_pci_device *pds_vfio =
+             container_of(vdev, struct pds_vfio_pci_device,
+                          vfio_coredev.vdev);
+     int err;
+
+     mutex_lock(&pds_vfio->state_mutex);
+     pds_vfio_send_host_vf_lm_status_cmd(pds_vfio, PDS_LM_STA_IN_PROGRESS);
+     err = pds_vfio_dirty_enable(pds_vfio, ranges, nnodes, page_size);
+     pds_vfio_state_mutex_unlock(pds_vfio);
+
+     return err;
+}
+
+int pds_vfio_dma_logging_stop(struct vfio_device *vdev)
+{
+     struct pds_vfio_pci_device *pds_vfio =
+             container_of(vdev, struct pds_vfio_pci_device,
+                          vfio_coredev.vdev);
+
+     mutex_lock(&pds_vfio->state_mutex);
+     pds_vfio_dirty_disable(pds_vfio, true);
+     pds_vfio_state_mutex_unlock(pds_vfio);
+
+     return 0;
+}
diff --git a/drivers/vfio/pci/pds/dirty.h b/drivers/vfio/pci/pds/dirty.h
new file mode 100644
index 000000000000..f78da25d75ca
--- /dev/null
+++ b/drivers/vfio/pci/pds/dirty.h
@@ -0,0 +1,39 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright(c) 2023 Advanced Micro Devices, Inc. */
+
+#ifndef _DIRTY_H_
+#define _DIRTY_H_
+
+struct pds_vfio_bmp_info {
+     unsigned long *bmp;
+     u32 bmp_bytes;
+     struct pds_lm_sg_elem *sgl;
+     dma_addr_t sgl_addr;
+     u16 num_sge;
+};
+
+struct pds_vfio_dirty {
+     struct pds_vfio_bmp_info host_seq;
+     struct pds_vfio_bmp_info host_ack;
+     u64 region_size;
+     u64 region_start;
+     u64 region_page_size;
+     bool is_enabled;
+};
+
+struct pds_vfio_pci_device;
+
+bool pds_vfio_dirty_is_enabled(struct pds_vfio_pci_device *pds_vfio);
+void pds_vfio_dirty_set_enabled(struct pds_vfio_pci_device *pds_vfio);
+void pds_vfio_dirty_set_disabled(struct pds_vfio_pci_device *pds_vfio);
+void pds_vfio_dirty_disable(struct pds_vfio_pci_device *pds_vfio,
+                         bool send_cmd);
+
+int pds_vfio_dma_logging_report(struct vfio_device *vdev, unsigned long iova,
+                             unsigned long length,
+                             struct iova_bitmap *dirty);
+int pds_vfio_dma_logging_start(struct vfio_device *vdev,
+                            struct rb_root_cached *ranges, u32 nnodes,
+                            u64 *page_size);
+int pds_vfio_dma_logging_stop(struct vfio_device *vdev);
+#endif /* _DIRTY_H_ */
diff --git a/drivers/vfio/pci/pds/lm.c b/drivers/vfio/pci/pds/lm.c
index 7e319529cf74..aec75574cab3 100644
--- a/drivers/vfio/pci/pds/lm.c
+++ b/drivers/vfio/pci/pds/lm.c
@@ -371,7 +371,7 @@ pds_vfio_step_device_state_locked(struct pds_vfio_pci_device *pds_vfio,

       if (cur == VFIO_DEVICE_STATE_STOP_COPY && next == VFIO_DEVICE_STATE_STOP) {
               pds_vfio_put_save_file(pds_vfio);
-             pds_vfio_send_host_vf_lm_status_cmd(pds_vfio, PDS_LM_STA_NONE);
+             pds_vfio_dirty_disable(pds_vfio, true);
               return NULL;
       }

diff --git a/drivers/vfio/pci/pds/vfio_dev.c b/drivers/vfio/pci/pds/vfio_dev.c
index b37ef96a7fd8..9e6a96b5db62 100644
--- a/drivers/vfio/pci/pds/vfio_dev.c
+++ b/drivers/vfio/pci/pds/vfio_dev.c
@@ -5,6 +5,7 @@
  #include <linux/vfio_pci_core.h>

  #include "lm.h"
+#include "dirty.h"
  #include "vfio_dev.h"

  struct pci_dev *pds_vfio_to_pci_dev(struct pds_vfio_pci_device *pds_vfio)
@@ -25,7 +26,7 @@ struct pds_vfio_pci_device *pds_vfio_pci_drvdata(struct pci_dev *pdev)
                           vfio_coredev);
  }

-static void pds_vfio_state_mutex_unlock(struct pds_vfio_pci_device *pds_vfio)
+void pds_vfio_state_mutex_unlock(struct pds_vfio_pci_device *pds_vfio)
  {
  again:
       spin_lock(&pds_vfio->reset_lock);
@@ -35,6 +36,7 @@ static void pds_vfio_state_mutex_unlock(struct pds_vfio_pci_device *pds_vfio)
                       pds_vfio->state = VFIO_DEVICE_STATE_RUNNING;
                       pds_vfio_put_restore_file(pds_vfio);
                       pds_vfio_put_save_file(pds_vfio);
+                     pds_vfio_dirty_disable(pds_vfio, false);
               }
               spin_unlock(&pds_vfio->reset_lock);
               goto again;
@@ -117,6 +119,12 @@ static const struct vfio_migration_ops pds_vfio_lm_ops = {
       .migration_get_data_size = pds_vfio_get_device_state_size
  };

+static const struct vfio_log_ops pds_vfio_log_ops = {
+     .log_start = pds_vfio_dma_logging_start,
+     .log_stop = pds_vfio_dma_logging_stop,
+     .log_read_and_clear = pds_vfio_dma_logging_report,
+};
+
  static int pds_vfio_init_device(struct vfio_device *vdev)
  {
       struct pds_vfio_pci_device *pds_vfio =
@@ -137,6 +145,7 @@ static int pds_vfio_init_device(struct vfio_device *vdev)

       vdev->migration_flags = VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P;
       vdev->mig_ops = &pds_vfio_lm_ops;
+     vdev->log_ops = &pds_vfio_log_ops;

       pci_id = PCI_DEVID(pdev->bus->number, pdev->devfn);
       dev_dbg(&pdev->dev,
@@ -175,6 +184,7 @@ static void pds_vfio_close_device(struct vfio_device *vdev)
       mutex_lock(&pds_vfio->state_mutex);
       pds_vfio_put_restore_file(pds_vfio);
       pds_vfio_put_save_file(pds_vfio);
+     pds_vfio_dirty_disable(pds_vfio, true);
       mutex_unlock(&pds_vfio->state_mutex);
       mutex_destroy(&pds_vfio->state_mutex);
       vfio_pci_core_close_device(vdev);
diff --git a/drivers/vfio/pci/pds/vfio_dev.h b/drivers/vfio/pci/pds/vfio_dev.h
index 31bd14de0c91..8109fe101694 100644
--- a/drivers/vfio/pci/pds/vfio_dev.h
+++ b/drivers/vfio/pci/pds/vfio_dev.h
@@ -7,6 +7,7 @@
  #include <linux/pci.h>
  #include <linux/vfio_pci_core.h>

+#include "dirty.h"
  #include "lm.h"

  struct pdsc;
@@ -17,6 +18,7 @@ struct pds_vfio_pci_device {

       struct pds_vfio_lm_file *save_file;
       struct pds_vfio_lm_file *restore_file;
+     struct pds_vfio_dirty dirty;
       struct mutex state_mutex; /* protect migration state */
       enum vfio_device_mig_state state;
       spinlock_t reset_lock; /* protect reset_done flow */
@@ -26,6 +28,8 @@ struct pds_vfio_pci_device {
       u16 client_id;
  };

+void pds_vfio_state_mutex_unlock(struct pds_vfio_pci_device *pds_vfio);
+
  const struct vfio_device_ops *pds_vfio_ops_info(void);
  struct pds_vfio_pci_device *pds_vfio_pci_drvdata(struct pci_dev *pdev);
  void pds_vfio_reset(struct pds_vfio_pci_device *pds_vfio);
diff --git a/include/linux/pds/pds_adminq.h b/include/linux/pds/pds_adminq.h
index 9c79b3c8fc47..4b4e9a98b37b 100644
--- a/include/linux/pds/pds_adminq.h
+++ b/include/linux/pds/pds_adminq.h
@@ -835,6 +835,13 @@ enum pds_lm_cmd_opcode {
       PDS_LM_CMD_RESUME          = 20,
       PDS_LM_CMD_SAVE            = 21,
       PDS_LM_CMD_RESTORE         = 22,
+
+     /* Dirty page tracking commands */
+     PDS_LM_CMD_DIRTY_STATUS    = 32,
+     PDS_LM_CMD_DIRTY_ENABLE    = 33,
+     PDS_LM_CMD_DIRTY_DISABLE   = 34,
+     PDS_LM_CMD_DIRTY_READ_SEQ  = 35,
+     PDS_LM_CMD_DIRTY_WRITE_ACK = 36,
  };

  /**
@@ -992,6 +999,172 @@ enum pds_lm_host_vf_status {
       PDS_LM_STA_MAX,
  };

+/**
+ * struct pds_lm_dirty_region_info - Memory region info for STATUS and ENABLE
+ * @dma_base:                Base address of the DMA-contiguous memory region
+ * @page_count:              Number of pages in the memory region
+ * @page_size_log2:  Log2 page size in the memory region
+ * @rsvd:            Word boundary padding
+ */
+struct pds_lm_dirty_region_info {
+     __le64 dma_base;
+     __le32 page_count;
+     u8     page_size_log2;
+     u8     rsvd[3];
+};
+
+/**
+ * struct pds_lm_dirty_status_cmd - DIRTY_STATUS command
+ * @opcode:          Opcode PDS_LM_CMD_DIRTY_STATUS
+ * @rsvd:            Word boundary padding
+ * @vf_id:           VF id
+ * @max_regions:     Capacity of the region info buffer
+ * @rsvd2:           Word boundary padding
+ * @regions_dma:     DMA address of the region info buffer
+ *
+ * The minimum of max_regions (from the command) and num_regions (from the
+ * completion) of struct pds_lm_dirty_region_info will be written to
+ * regions_dma.
+ *
+ * The max_regions may be zero, in which case regions_dma is ignored.  In that
+ * case, the completion will only report the maximum number of regions
+ * supported by the device, and the number of regions currently enabled.
+ */
+struct pds_lm_dirty_status_cmd {
+     u8     opcode;
+     u8     rsvd;
+     __le16 vf_id;
+     u8     max_regions;
+     u8     rsvd2[3];
+     __le64 regions_dma;
+} __packed;
+
+/**
+ * enum pds_lm_dirty_bmp_type - Type of dirty page bitmap
+ * @PDS_LM_DIRTY_BMP_TYPE_NONE: No bitmap / disabled
+ * @PDS_LM_DIRTY_BMP_TYPE_SEQ_ACK: Seq/Ack bitmap representation
+ */
+enum pds_lm_dirty_bmp_type {
+     PDS_LM_DIRTY_BMP_TYPE_NONE     = 0,
+     PDS_LM_DIRTY_BMP_TYPE_SEQ_ACK  = 1,
+};
+
+/**
+ * struct pds_lm_dirty_status_comp - STATUS command completion
+ * @status:          Status of the command (enum pds_core_status_code)
+ * @rsvd:            Word boundary padding
+ * @comp_index:              Index in the desc ring for which this is the completion
+ * @max_regions:     Maximum number of regions supported by the device
+ * @num_regions:     Number of regions currently enabled
+ * @bmp_type:                Type of dirty bitmap representation
+ * @rsvd2:           Word boundary padding
+ * @bmp_type_mask:   Mask of supported bitmap types, bit index per type
+ * @rsvd3:           Word boundary padding
+ * @color:           Color bit
+ *
+ * This completion descriptor is used for STATUS, ENABLE, and DISABLE.
+ */
+struct pds_lm_dirty_status_comp {
+     u8     status;
+     u8     rsvd;
+     __le16 comp_index;
+     u8     max_regions;
+     u8     num_regions;
+     u8     bmp_type;
+     u8     rsvd2;
+     __le32 bmp_type_mask;
+     u8     rsvd3[3];
+     u8     color;
+};
+
+/**
+ * struct pds_lm_dirty_enable_cmd - DIRTY_ENABLE command
+ * @opcode:          Opcode PDS_LM_CMD_DIRTY_ENABLE
+ * @rsvd:            Word boundary padding
+ * @vf_id:           VF id
+ * @bmp_type:                Type of dirty bitmap representation
+ * @num_regions:     Number of entries in the region info buffer
+ * @rsvd2:           Word boundary padding
+ * @regions_dma:     DMA address of the region info buffer
+ *
+ * The num_regions must be nonzero, and less than or equal to the maximum
+ * number of regions supported by the device.
+ *
+ * The memory regions should not overlap.
+ *
+ * The information should be initialized by the driver.  The device may modify
+ * the information on successful completion, such as by size-aligning the
+ * number of pages in a region.
+ *
+ * The modified number of pages will be greater than or equal to the page count
+ * given in the enable command, and at least as coarsely aligned as the given
+ * value.  For example, the count might be aligned to a multiple of 64, but
+ * if the value is already a multiple of 128 or higher, it will not change.
+ * If the driver requires its own minimum alignment of the number of pages, the
+ * driver should account for that already in the region info of this command.
+ *
+ * This command uses struct pds_lm_dirty_status_comp for its completion.
+ */
+struct pds_lm_dirty_enable_cmd {
+     u8     opcode;
+     u8     rsvd;
+     __le16 vf_id;
+     u8     bmp_type;
+     u8     num_regions;
+     u8     rsvd2[2];
+     __le64 regions_dma;
+} __packed;
+
+/**
+ * struct pds_lm_dirty_disable_cmd - DIRTY_DISABLE command
+ * @opcode:  Opcode PDS_LM_CMD_DIRTY_DISABLE
+ * @rsvd:    Word boundary padding
+ * @vf_id:   VF id
+ *
+ * Dirty page tracking will be disabled.  This may be called in any state, as
+ * long as dirty page tracking is supported by the device, to ensure that dirty
+ * page tracking is disabled.
+ *
+ * This command uses struct pds_lm_dirty_status_comp for its completion.  On
+ * success, num_regions will be zero.
+ */
+struct pds_lm_dirty_disable_cmd {
+     u8     opcode;
+     u8     rsvd;
+     __le16 vf_id;
+};
+
+/**
+ * struct pds_lm_dirty_seq_ack_cmd - DIRTY_READ_SEQ or _WRITE_ACK command
+ * @opcode:  Opcode PDS_LM_CMD_DIRTY_[READ_SEQ|WRITE_ACK]
+ * @rsvd:    Word boundary padding
+ * @vf_id:   VF id
+ * @off_bytes:       Byte offset in the bitmap
+ * @len_bytes:       Number of bytes to transfer
+ * @num_sge: Number of DMA scatter gather elements
+ * @rsvd2:   Word boundary padding
+ * @sgl_addr:        DMA address of scatter gather list
+ *
+ * Read bytes from the SEQ bitmap, or write bytes into the ACK bitmap.
+ *
+ * This command treats the entire bitmap as a byte buffer.  It does not
+ * distinguish between guest memory regions.  The driver should refer to the
+ * number of pages in each region, according to PDS_LM_CMD_DIRTY_STATUS, to
+ * determine the region boundaries in the bitmap.  Each region will be
+ * represented by exactly the number of bits as the page count for that region,
+ * immediately following the last bit of the previous region.
+ */
+struct pds_lm_dirty_seq_ack_cmd {
+     u8     opcode;
+     u8     rsvd;
+     __le16 vf_id;
+     __le32 off_bytes;
+     __le32 len_bytes;
+     __le16 num_sge;
+     u8     rsvd2[2];
+     __le64 sgl_addr;
+} __packed;
+
  /**
   * struct pds_lm_host_vf_status_cmd - HOST_VF_STATUS command
   * @opcode:  Opcode PDS_LM_CMD_HOST_VF_STATUS
@@ -1039,6 +1212,10 @@ union pds_core_adminq_cmd {
       struct pds_lm_save_cmd            lm_save;
       struct pds_lm_restore_cmd         lm_restore;
       struct pds_lm_host_vf_status_cmd  lm_host_vf_status;
+     struct pds_lm_dirty_status_cmd    lm_dirty_status;
+     struct pds_lm_dirty_enable_cmd    lm_dirty_enable;
+     struct pds_lm_dirty_disable_cmd   lm_dirty_disable;
+     struct pds_lm_dirty_seq_ack_cmd   lm_dirty_seq_ack;
  };

  union pds_core_adminq_comp {
@@ -1065,6 +1242,7 @@ union pds_core_adminq_comp {
       struct pds_vdpa_vq_reset_comp     vdpa_vq_reset;

       struct pds_lm_state_size_comp     lm_state_size;
+     struct pds_lm_dirty_status_comp   lm_dirty_status;
  };

  #ifndef __CHECKER__
--
2.17.1



