[RFC 29/29] vfio/vgpu_mgr: introduce NVIDIA vGPU VFIO variant driver

Zhi Wang <zhiw@xxxxxxxxxx> · Sun, 22 Sep 2024 05:49:51 -0700

A VFIO variant driver module is designed to extend the capabilities of
the existing VFIO (Virtual Function I/O), offering device management
interfaces to the userspace and advanced feature support.

For userspace to use the NVIDIA vGPU, a new vGPU VFIO variant driver
is introduced to provide vGPU management, such as selecting/creating a
vGPU instance, and to support advanced features like live migration.

Introduce the NVIDIA vGPU VFIO variant driver to support the vGPU
lifecycle management UABI and future advanced features.

Cc: Neo Jia <cjia@xxxxxxxxxx>
Cc: Surath Mitra <smitra@xxxxxxxxxx>
Cc: Kirti Wankhede <kwankhede@xxxxxxxxxx>
Cc: Vinay Kabra <vkabra@xxxxxxxxxx>
Cc: Ankit Agrawal <ankita@xxxxxxxxxx>
Signed-off-by: Zhi Wang <zhiw@xxxxxxxxxx>
---
 drivers/vfio/pci/nvidia-vgpu/Makefile      |   3 +
 drivers/vfio/pci/nvidia-vgpu/vfio.h        |  43 ++
 drivers/vfio/pci/nvidia-vgpu/vfio_access.c | 297 ++++++++++++
 drivers/vfio/pci/nvidia-vgpu/vfio_main.c   | 511 +++++++++++++++++++++
 drivers/vfio/pci/nvidia-vgpu/vgpu.c        |  22 +
 drivers/vfio/pci/nvidia-vgpu/vgpu_mgr.h    |   2 +-
 6 files changed, 877 insertions(+), 1 deletion(-)
 create mode 100644 drivers/vfio/pci/nvidia-vgpu/vfio.h
 create mode 100644 drivers/vfio/pci/nvidia-vgpu/vfio_access.c
 create mode 100644 drivers/vfio/pci/nvidia-vgpu/vfio_main.c

diff --git a/drivers/vfio/pci/nvidia-vgpu/Makefile b/drivers/vfio/pci/nvidia-vgpu/Makefile
index fade9d49df97..99c47e2f436d 100644
--- a/drivers/vfio/pci/nvidia-vgpu/Makefile
+++ b/drivers/vfio/pci/nvidia-vgpu/Makefile
@@ -3,3 +3,6 @@ ccflags-y += -I$(srctree)/$(src)/include
 
 obj-$(CONFIG_NVIDIA_VGPU_MGR) += nvidia-vgpu-mgr.o
 nvidia-vgpu-mgr-y := vgpu_mgr.o vgpu.o vgpu_types.o rpc.o
+
+obj-$(CONFIG_NVIDIA_VGPU_VFIO_PCI) += nvidia-vgpu-vfio-pci.o
+nvidia-vgpu-vfio-pci-y := vfio_main.o vfio_access.o
diff --git a/drivers/vfio/pci/nvidia-vgpu/vfio.h b/drivers/vfio/pci/nvidia-vgpu/vfio.h
new file mode 100644
index 000000000000..fa6bbf81552d
--- /dev/null
+++ b/drivers/vfio/pci/nvidia-vgpu/vfio.h
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: GPL-2.0 OR MIT */
+/*
+ * Copyright © 2024 NVIDIA Corporation
+ */
+
+#ifndef _NVIDIA_VGPU_VFIO_H__
+#define _NVIDIA_VGPU_VFIO_H__
+
+#include <linux/vfio_pci_core.h>
+
+#include <nvrm/nvtypes.h>
+#include <nvrm/common/sdk/nvidia/inc/ctrl/ctrla081.h>
+#include <nvrm/common/sdk/nvidia/inc/ctrl/ctrl2080/ctrl2080vgpumgrinternal.h>
+
+#include "vgpu_mgr.h"
+
+/* NOTE(review): the next two are not referenced in the visible code — confirm users. */
+#define VGPU_CONFIG_PARAMS_MAX_LENGTH 1024
+#define DEVICE_CLASS_LENGTH 5
+/* Size in bytes of the emulated PCI configuration space buffer. */
+#define PCI_CONFIG_SPACE_LENGTH 4096
+
+/*
+ * Offset stored in the emulated capability-list pointer. Config accesses in
+ * [CAP_LIST_NEXT_PTR_MSIX, CAP_LIST_NEXT_PTR_MSIX + MSIX_CAP_SIZE) are
+ * forwarded to the underlying PCI device instead of the emulated buffer.
+ */
+#define CAP_LIST_NEXT_PTR_MSIX 0x7c
+#define MSIX_CAP_SIZE   0xc
+
+/*
+ * Per-device state of the NVIDIA vGPU VFIO variant driver.
+ */
+struct nvidia_vgpu_vfio {
+	/* Embedded VFIO PCI core device; container_of() recovers this struct. */
+	struct vfio_pci_core_device core_dev;
+	/* Emulated config space, filled by nvidia_vgpu_vfio_setup_config(). */
+	u8 config_space[PCI_CONFIG_SPACE_LENGTH];
+
+	/* BAR0 MMIO mapping; NULL until the first emulated BAR0 access. */
+	void __iomem *bar0_map;
+
+	/* vGPU type table (and num_vgpu_types below), copied from vgpu_mgr. */
+	u8 **vgpu_types;
+	/* vGPU type selected for this device; NULL when none was found. */
+	NVA081_CTRL_VGPU_INFO *curr_vgpu_type;
+	u32 num_vgpu_types;
+
+	/* vGPU manager obtained through nvidia_vgpu_mgr_get(). */
+	struct nvidia_vgpu_mgr *vgpu_mgr;
+	/* vGPU instance; allocated on device open, freed on close. */
+	struct nvidia_vgpu *vgpu;
+};
+
+/* Fill nvdev->config_space with the emulated PCI configuration header. */
+void nvidia_vgpu_vfio_setup_config(struct nvidia_vgpu_vfio *nvdev);
+
+/*
+ * Emulate one access to a VFIO region. @buf is a kernel buffer (the
+ * read/write handlers bounce user data through a local variable), so it
+ * must not carry the __user annotation; this now matches the definition.
+ * @ppos encodes both the region index and the offset inside the region.
+ */
+ssize_t nvidia_vgpu_vfio_access(struct nvidia_vgpu_vfio *nvdev,
+				char *buf, size_t count,
+				loff_t ppos, bool iswrite);
+
+#endif /* _NVIDIA_VGPU_VFIO_H__ */
diff --git a/drivers/vfio/pci/nvidia-vgpu/vfio_access.c b/drivers/vfio/pci/nvidia-vgpu/vfio_access.c
new file mode 100644
index 000000000000..320c72a07dbe
--- /dev/null
+++ b/drivers/vfio/pci/nvidia-vgpu/vfio_access.c
@@ -0,0 +1,297 @@
+// SPDX-License-Identifier: GPL-2.0 OR MIT
+/*
+ * Copyright © 2024 NVIDIA Corporation
+ */
+
+#include <linux/string.h>
+#include <linux/pci.h>
+#include <linux/pci_regs.h>
+
+#include "vfio.h"
+
+/*
+ * nvidia_vgpu_vfio_setup_config - build the emulated PCI config header
+ * @nvdev: device whose config_space buffer is (re)initialised
+ *
+ * Clears the whole buffer and then fills in a type-0 header. Device and
+ * subsystem IDs come from the two halves of the selected vGPU type's
+ * vdevId; the revision byte is read from the underlying PCI device.
+ * nvdev->curr_vgpu_type must already be set.
+ */
+void nvidia_vgpu_vfio_setup_config(struct nvidia_vgpu_vfio *nvdev)
+{
+	u8 *cfg = nvdev->config_space;
+
+	memset(cfg, 0, sizeof(nvdev->config_space));
+
+	/* Header type 0 (normal devices) */
+	*(u16 *)&cfg[PCI_VENDOR_ID] = PCI_VENDOR_ID_NVIDIA;
+	*(u16 *)&cfg[PCI_DEVICE_ID] =
+		FIELD_GET(GENMASK(31, 16), nvdev->curr_vgpu_type->vdevId);
+	*(u16 *)&cfg[PCI_COMMAND] = 0;
+	/* Advertise that a capability list is present. */
+	*(u16 *)&cfg[PCI_STATUS] = PCI_STATUS_CAP_LIST;
+
+	/* Lowest byte of the class/revision dword: taken from the device. */
+	pci_read_config_byte(nvdev->core_dev.pdev, PCI_CLASS_REVISION,
+			     &cfg[PCI_CLASS_REVISION]);
+
+	cfg[PCI_CLASS_PROG] = 0; /* VGA-compatible */
+	cfg[PCI_CLASS_DEVICE] = 0; /* VGA controller */
+	cfg[PCI_CLASS_DEVICE + 1] = 3; /* display controller */
+
+	/* BAR0: 32-bit */
+	*(u32 *)&cfg[PCI_BASE_ADDRESS_0] = PCI_BASE_ADDRESS_MEM_TYPE_32;
+	/* BAR1: 64-bit, prefetchable */
+	*(u32 *)&cfg[PCI_BASE_ADDRESS_1] =
+		PCI_BASE_ADDRESS_MEM_TYPE_64 | PCI_BASE_ADDRESS_MEM_PREFETCH;
+	/* BAR2: 64-bit, prefetchable */
+	*(u32 *)&cfg[PCI_BASE_ADDRESS_3] =
+		PCI_BASE_ADDRESS_MEM_TYPE_64 | PCI_BASE_ADDRESS_MEM_PREFETCH;
+	/* Disable BAR3: I/O */
+	*(u32 *)&cfg[PCI_BASE_ADDRESS_5] = 0;
+
+	*(u16 *)&cfg[PCI_SUBSYSTEM_VENDOR_ID] = PCI_VENDOR_ID_NVIDIA;
+	*(u16 *)&cfg[PCI_SUBSYSTEM_ID] =
+		FIELD_GET(GENMASK(15, 0), nvdev->curr_vgpu_type->vdevId);
+
+	/* Single capability at CAP_LIST_NEXT_PTR_MSIX, terminating the list. */
+	cfg[PCI_CAPABILITY_LIST] = CAP_LIST_NEXT_PTR_MSIX;
+	cfg[CAP_LIST_NEXT_PTR_MSIX + PCI_CAP_LIST_NEXT] = 0;
+
+	/* INTx disabled */
+	cfg[PCI_INTERRUPT_PIN] = 0;
+}
+
+/*
+ * Read @count bytes (4, 2 or 1) from the device's real config space at
+ * @offset into @buf. Any other width only triggers a one-time warning.
+ */
+static void read_hw_pci_config(struct pci_dev *pdev, char *buf,
+			       size_t count, loff_t offset)
+{
+	if (count == 4)
+		pci_read_config_dword(pdev, offset, (u32 *)buf);
+	else if (count == 2)
+		pci_read_config_word(pdev, offset, (u16 *)buf);
+	else if (count == 1)
+		pci_read_config_byte(pdev, offset, (u8 *)buf);
+	else
+		WARN_ONCE(1, "Not supported access len\n");
+}
+
+/*
+ * Write @count bytes (4, 2 or 1) taken from @buf to the device's real
+ * config space at @offset. Any other width only triggers a one-time warning.
+ */
+static void write_hw_pci_config(struct pci_dev *pdev, char *buf,
+				size_t count, loff_t offset)
+{
+	if (count == 4)
+		pci_write_config_dword(pdev, offset, *(u32 *)buf);
+	else if (count == 2)
+		pci_write_config_word(pdev, offset, *(u16 *)buf);
+	else if (count == 1)
+		pci_write_config_byte(pdev, offset, *(u8 *)buf);
+	else
+		WARN_ONCE(1, "Not supported access len\n");
+}
+
+/* Dispatch a hardware config-space access to the read or write helper. */
+static void hw_pci_config_rw(struct pci_dev *pdev, char *buf,
+			     size_t count, loff_t offset,
+			     bool is_write)
+{
+	if (is_write)
+		write_hw_pci_config(pdev, buf, count, offset);
+	else
+		read_hw_pci_config(pdev, buf, count, offset);
+}
+
+/*
+ * bar0_rw - emulate a single read or write of BAR0 MMIO space
+ * @nvdev: device being accessed
+ * @buf: kernel buffer holding the value to write / receiving the value read
+ * @count: access width in bytes; must be 1, 2 or 4
+ * @ppos: VFIO file offset (region index in the upper bits, offset below)
+ * @iswrite: true for a write, false for a read
+ *
+ * BAR0 is requested and ioremap()ed on first use; the mapping is kept in
+ * nvdev->bar0_map for later calls.
+ *
+ * Returns @count on success or a negative errno.
+ */
+static ssize_t bar0_rw(struct nvidia_vgpu_vfio *nvdev, char *buf,
+		       size_t count, loff_t ppos, bool iswrite)
+{
+	struct pci_dev *pdev = nvdev->core_dev.pdev;
+	int index = VFIO_PCI_OFFSET_TO_INDEX(ppos);
+	loff_t offset = ppos & VFIO_PCI_OFFSET_MASK;
+	void __iomem *map;
+	u32 val = 0;
+	int ret;
+
+	if (index != VFIO_PCI_BAR0_REGION_INDEX)
+		return -EINVAL;
+
+	/* Only naturally sized accesses are implemented below. */
+	if (count != 1 && count != 2 && count != 4)
+		return -EINVAL;
+
+	/* The offset comes from userspace: never touch MMIO past the BAR. */
+	if (offset + count > pci_resource_len(pdev, index))
+		return -EINVAL;
+
+	if (nvdev->bar0_map == NULL) {
+		ret = pci_request_selected_regions(pdev, 1 << index, "nvidia-vgpu-vfio");
+		if (ret)
+			return ret;
+
+		if (!(pci_resource_flags(pdev, index) & IORESOURCE_MEM)) {
+			pci_release_selected_regions(pdev, 1 << index);
+			return -EIO;
+		}
+
+		map = ioremap(pci_resource_start(pdev, index), pci_resource_len(pdev, index));
+		if (!map) {
+			pci_err(pdev, "Can't map BAR0 MMIO space\n");
+			pci_release_selected_regions(pdev, 1 << index);
+			return -ENOMEM;
+		}
+		nvdev->bar0_map = map;
+	} else {
+		map = nvdev->bar0_map;
+	}
+
+	if (!iswrite) {
+		switch (count) {
+		case 4:
+			val = ioread32(map + offset);
+			break;
+		case 2:
+			val = ioread16(map + offset);
+			break;
+		case 1:
+			val = ioread8(map + offset);
+			break;
+		}
+		memcpy(buf, (u8 *)&val, count);
+	} else {
+		switch (count) {
+		case 4:
+			iowrite32(*(u32 *)buf, map + offset);
+			break;
+		case 2:
+			iowrite16(*(u16 *)buf, map + offset);
+			break;
+		case 1:
+			iowrite8(*(u8 *)buf, map + offset);
+			break;
+		}
+	}
+	return count;
+}
+
+/*
+ * pci_config_rw - emulate a single PCI config-space access
+ * @nvdev: device being accessed
+ * @buf: kernel buffer with the value to write / receiving the value read;
+ *       it is dereferenced as a u32 on some paths, so callers must provide
+ *       at least four bytes even for narrower accesses
+ * @count: access width in bytes
+ * @ppos: VFIO file offset (region index in the upper bits, offset below)
+ * @iswrite: true for a write, false for a read
+ *
+ * Accesses inside the MSI-X capability window go straight to the device.
+ * Everything else is served from nvdev->config_space, with PCI_COMMAND and
+ * PCI_STATUS partially backed by the device and BAR writes masked to the
+ * size of the corresponding BAR.
+ *
+ * Returns @count on success or a negative errno.
+ */
+static ssize_t pci_config_rw(struct nvidia_vgpu_vfio *nvdev, char *buf,
+			     size_t count, loff_t ppos, bool iswrite)
+{
+	struct pci_dev *pdev = nvdev->core_dev.pdev;
+	int index = VFIO_PCI_OFFSET_TO_INDEX(ppos);
+	loff_t offset = ppos;
+	u32 bar_mask, cfg_addr;
+	u32 val = 0;
+
+	if (index != VFIO_PCI_CONFIG_REGION_INDEX)
+		return -EINVAL;
+
+	offset &= VFIO_PCI_OFFSET_MASK;
+
+	/*
+	 * The offset comes from userspace; without this check the memcpy()s
+	 * below would read or write beyond config_space[].
+	 */
+	if (offset + count > sizeof(nvdev->config_space))
+		return -EINVAL;
+
+	/* MSI-X capability: pass through to the device. */
+	if ((offset >= CAP_LIST_NEXT_PTR_MSIX) && (offset <
+				(CAP_LIST_NEXT_PTR_MSIX + MSIX_CAP_SIZE))) {
+		hw_pci_config_rw(pdev, buf, count, offset, iswrite);
+		return count;
+	}
+
+	if (!iswrite) {
+		/* Start from the emulated value, then merge in device bits. */
+		memcpy(buf, (u8 *)&nvdev->config_space[offset], count);
+
+		switch (offset) {
+		case PCI_COMMAND:
+			hw_pci_config_rw(pdev, (char *)&val, count, offset, iswrite);
+
+			switch (count) {
+			case 4:
+				/* Upper half is PCI_STATUS; keep only PARITY/SERR of COMMAND. */
+				val = (u32)(val & 0xFFFF0000) | (val &
+					(PCI_COMMAND_PARITY | PCI_COMMAND_SERR));
+				break;
+			case 2:
+				val = (val & (PCI_COMMAND_PARITY | PCI_COMMAND_SERR));
+				break;
+			default:
+				WARN_ONCE(1, "Not supported access len\n");
+				break;
+			}
+			break;
+		case PCI_STATUS:
+			hw_pci_config_rw(pdev, (char *)&val, count, offset, iswrite);
+			break;
+
+		default:
+			break;
+		}
+		/* val stays 0 unless one of the cases above read the device. */
+		*(u32 *)buf = *(u32 *)buf | val;
+	} else {
+		switch (offset) {
+		/* Writes to these registers are silently dropped. */
+		case PCI_VENDOR_ID:
+		case PCI_DEVICE_ID:
+		case PCI_CAPABILITY_LIST:
+			break;
+
+		case PCI_STATUS:
+			hw_pci_config_rw(pdev, buf, count, offset, iswrite);
+			break;
+
+		case PCI_COMMAND:
+			if (count == 4) {
+				/* A dword write also covers PCI_STATUS: forward that half. */
+				val = (u32)((*(u32 *)buf & 0xFFFF0000) >> 16);
+				hw_pci_config_rw(pdev, (char *)&val, 2, PCI_STATUS, iswrite);
+
+				val = (u32)(*(u32 *)buf & 0x0000FFFF);
+				*(u32 *)buf = val;
+			}
+
+			memcpy((u8 *)&nvdev->config_space[offset], buf, count);
+			break;
+
+		case PCI_BASE_ADDRESS_0:
+		case PCI_BASE_ADDRESS_1:
+		case PCI_BASE_ADDRESS_2:
+		case PCI_BASE_ADDRESS_3:
+		case PCI_BASE_ADDRESS_4:
+			/*
+			 * BAR writes: mask the address to the BAR size and keep
+			 * the stored low flag bits of the register.
+			 * NOTE(review): a full dword is read from @buf and stored
+			 * regardless of @count — confirm narrower BAR writes
+			 * cannot occur or are acceptable.
+			 */
+			cfg_addr = *(u32 *)buf;
+
+			switch (offset) {
+			case PCI_BASE_ADDRESS_0:
+				bar_mask = (u32)((~(pci_resource_len(pdev, VFIO_PCI_BAR0_REGION_INDEX)) + 1) & ~0xFul);
+				cfg_addr = (cfg_addr & bar_mask) | (nvdev->config_space[offset] & 0xFul);
+				break;
+			case PCI_BASE_ADDRESS_1:
+				bar_mask = (u32)((~(nvdev->curr_vgpu_type->bar1Length * 1024 * 1024) + 1) & ~0xFul);
+				cfg_addr = (cfg_addr & bar_mask) | (nvdev->config_space[offset] & 0xFul);
+				break;
+
+			case PCI_BASE_ADDRESS_2:
+				/* Upper 32 bits of the 64-bit BAR starting at PCI_BASE_ADDRESS_1. */
+				bar_mask = (u32)(((~(nvdev->curr_vgpu_type->bar1Length * 1024 * 1024) + 1) & ~0xFul) >> 32);
+				cfg_addr = (cfg_addr & bar_mask);
+				break;
+
+			case PCI_BASE_ADDRESS_3:
+				bar_mask = (u32)((~(pci_resource_len(pdev, VFIO_PCI_BAR3_REGION_INDEX)) + 1) & ~0xFul);
+				cfg_addr = (cfg_addr & bar_mask) | (nvdev->config_space[offset] & 0xFul);
+				break;
+
+			case PCI_BASE_ADDRESS_4:
+				/* Upper 32 bits of the 64-bit BAR starting at PCI_BASE_ADDRESS_3. */
+				bar_mask = (u32)(((~(pci_resource_len(pdev, VFIO_PCI_BAR3_REGION_INDEX)) + 1) & ~0xFul) >> 32);
+				cfg_addr = (cfg_addr & bar_mask);
+				break;
+			}
+			*(u32 *)&nvdev->config_space[offset] = cfg_addr;
+			break;
+		default:
+			/* All other registers ignore writes. */
+			break;
+
+		}
+	}
+	return count;
+}
+
+/*
+ * nvidia_vgpu_vfio_access - emulate one access to a VFIO region
+ * @nvdev: device being accessed
+ * @buf: kernel buffer with the data to write / receiving the data read
+ * @count: access width in bytes
+ * @ppos: VFIO file offset; the region index is encoded in the upper bits
+ * @iswrite: true for a write, false for a read
+ *
+ * Only the config-space and BAR0 regions are emulated here; every other
+ * region index is rejected.
+ *
+ * Returns the number of bytes handled or a negative errno.
+ */
+ssize_t nvidia_vgpu_vfio_access(struct nvidia_vgpu_vfio *nvdev, char *buf,
+				size_t count, loff_t ppos, bool iswrite)
+{
+	int index = VFIO_PCI_OFFSET_TO_INDEX(ppos);
+
+	switch (index) {
+	case VFIO_PCI_CONFIG_REGION_INDEX:
+		return pci_config_rw(nvdev, buf, count, ppos, iswrite);
+	case VFIO_PCI_BAR0_REGION_INDEX:
+		return bar0_rw(nvdev, buf, count, ppos, iswrite);
+	default:
+		/* Also covers indices at or beyond VFIO_PCI_NUM_REGIONS. */
+		return -EINVAL;
+	}
+}
diff --git a/drivers/vfio/pci/nvidia-vgpu/vfio_main.c b/drivers/vfio/pci/nvidia-vgpu/vfio_main.c
new file mode 100644
index 000000000000..667ed6fb48f6
--- /dev/null
+++ b/drivers/vfio/pci/nvidia-vgpu/vfio_main.c
@@ -0,0 +1,511 @@
+// SPDX-License-Identifier: GPL-2.0 OR MIT
+/*
+ * Copyright © 2024 NVIDIA Corporation
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/pci.h>
+#include <linux/vfio_pci_core.h>
+#include <linux/types.h>
+
+#include "vfio.h"
+
+/* Derive the GFID of a VF: its pci_iov_vf_id() value plus one. */
+static int pdev_to_gfid(struct pci_dev *pdev)
+{
+	int vf_id = pci_iov_vf_id(pdev);
+
+	return vf_id + 1;
+}
+
+/*
+ * Tear down the vGPU attached to @nvdev. The instance is freed and the
+ * pointer cleared only when the vGPU manager reports success; on failure
+ * the error is returned and nvdev->vgpu is left untouched.
+ */
+static int destroy_vgpu(struct nvidia_vgpu_vfio *nvdev)
+{
+	int err = nvidia_vgpu_mgr_destroy_vgpu(nvdev->vgpu);
+
+	if (!err) {
+		kfree(nvdev->vgpu);
+		nvdev->vgpu = NULL;
+	}
+
+	return err;
+}
+
+/*
+ * create_vgpu - allocate and create the vGPU instance for @nvdev
+ *
+ * Fills in the instance identifiers from the VF's PCI identity and asks
+ * the vGPU manager to create a vGPU of type nvdev->curr_vgpu_type. On
+ * success the new instance is stored in nvdev->vgpu.
+ *
+ * Returns 0 on success, -ENOMEM if the allocation fails, or the error
+ * returned by nvidia_vgpu_mgr_create_vgpu().
+ */
+static int create_vgpu(struct nvidia_vgpu_vfio *nvdev)
+{
+	struct nvidia_vgpu_mgr *vgpu_mgr = nvdev->vgpu_mgr;
+	struct pci_dev *pdev = nvdev->core_dev.pdev;
+	struct nvidia_vgpu *vgpu;
+	int ret;
+
+	vgpu = kzalloc(sizeof(*vgpu), GFP_KERNEL);
+	if (!vgpu)
+		return -ENOMEM;
+
+	vgpu->info.id = pci_iov_vf_id(pdev);
+	/* NOTE(review): upper 16 bits are hard-coded to 0 — confirm intent. */
+	vgpu->info.dbdf = (0 << 16) | pci_dev_id(pdev);
+	vgpu->info.gfid = pdev_to_gfid(pdev);
+
+	vgpu->vgpu_mgr = vgpu_mgr;
+	vgpu->pdev = pdev;
+
+	ret = nvidia_vgpu_mgr_create_vgpu(vgpu,
+			(u8 *)nvdev->curr_vgpu_type);
+	if (ret) {
+		kfree(vgpu);
+		return ret;
+	}
+
+	/* Success is not an error: log at debug level, tagged with the device. */
+	pci_dbg(pdev, "create_vgpu() called\n");
+	nvdev->vgpu = vgpu;
+	return 0;
+}
+
+/* Map a vfio_device to the vfio_pci_core_device that embeds it. */
+static inline struct vfio_pci_core_device *
+vdev_to_core_dev(struct vfio_device *vdev)
+{
+	struct vfio_pci_core_device *core;
+
+	core = container_of(vdev, struct vfio_pci_core_device, vdev);
+	return core;
+}
+
+/* Map a vfio_pci_core_device to the nvidia_vgpu_vfio that embeds it. */
+static inline struct nvidia_vgpu_vfio *
+core_dev_to_nvdev(struct vfio_pci_core_device *core_dev)
+{
+	struct nvidia_vgpu_vfio *nv;
+
+	nv = container_of(core_dev, struct nvidia_vgpu_vfio, core_dev);
+	return nv;
+}
+
+/*
+ * Drop the vGPU manager reference held by @nvdev and forget the type
+ * table that was copied from it.
+ */
+static void detach_vgpu_mgr(struct nvidia_vgpu_vfio *nvdev)
+{
+	struct nvidia_vgpu_mgr *mgr = nvdev->vgpu_mgr;
+
+	nvidia_vgpu_mgr_put(mgr);
+
+	nvdev->num_vgpu_types = 0;
+	nvdev->vgpu_types = NULL;
+	nvdev->vgpu_mgr = NULL;
+}
+
+/*
+ * Look up the vGPU manager for @pdev via nvidia_vgpu_mgr_get() and cache
+ * it in @nvdev together with its vGPU type table and table length.
+ *
+ * Returns 0 on success or the error encoded in the returned pointer.
+ */
+static int attach_vgpu_mgr(struct nvidia_vgpu_vfio *nvdev,
+			   struct pci_dev *pdev)
+{
+	struct nvidia_vgpu_mgr *mgr = nvidia_vgpu_mgr_get(pdev);
+
+	if (IS_ERR(mgr))
+		return PTR_ERR(mgr);
+
+	nvdev->vgpu_mgr = mgr;
+	nvdev->vgpu_types = mgr->vgpu_types;
+	nvdev->num_vgpu_types = mgr->num_vgpu_types;
+
+	return 0;
+}
+
+/*
+ * Linear search of nvdev->vgpu_types for the entry whose vgpuType equals
+ * @type_id. Returns the first match, or NULL when there is none.
+ */
+static NVA081_CTRL_VGPU_INFO *
+find_vgpu_type(struct nvidia_vgpu_vfio *nvdev, u32 type_id)
+{
+	u32 idx = 0;
+
+	while (idx < nvdev->num_vgpu_types) {
+		NVA081_CTRL_VGPU_INFO *candidate =
+			(NVA081_CTRL_VGPU_INFO *)nvdev->vgpu_types[idx];
+
+		if (candidate->vgpuType == type_id)
+			return candidate;
+		idx++;
+	}
+
+	return NULL;
+}
+
+/*
+ * nvidia_vgpu_vfio_open_device - VFIO open_device callback
+ *
+ * Creates the vGPU instance, enables the VF with bus mastering, copies
+ * the physical function's DMA mask to the VF, resets the function and
+ * finally issues the BME enable request to the vGPU manager. Every step
+ * is unwound on failure.
+ *
+ * Returns 0 on success, -ENODEV when no vGPU type is selected, -EINVAL
+ * when the device has no physical function, or the error of the failing
+ * step.
+ */
+static int
+nvidia_vgpu_vfio_open_device(struct vfio_device *vdev)
+{
+	struct vfio_pci_core_device *core_dev = vdev_to_core_dev(vdev);
+	struct nvidia_vgpu_vfio *nvdev = core_dev_to_nvdev(core_dev);
+	struct pci_dev *pdev = core_dev->pdev;
+	u64 pf_dma_mask;
+	int ret;
+
+	if (!nvdev->curr_vgpu_type)
+		return -ENODEV;
+
+	if (!pdev->physfn)
+		return -EINVAL;
+
+	ret = create_vgpu(nvdev);
+	if (ret)
+		return ret;
+
+	ret = pci_enable_device(pdev);
+	if (ret)
+		goto err_destroy_vgpu;
+
+	pci_set_master(pdev);
+
+	/* Use the PF's DMA mask; a mask the VF cannot honour is fatal. */
+	pf_dma_mask = dma_get_mask(&pdev->physfn->dev);
+	ret = dma_set_mask_and_coherent(&pdev->dev, pf_dma_mask);
+	if (ret)
+		goto err_disable_device;
+
+	ret = pci_try_reset_function(pdev);
+	if (ret)
+		goto err_disable_device;
+
+	ret = nvidia_vgpu_mgr_enable_bme(nvdev->vgpu);
+	if (ret)
+		goto err_disable_device;
+
+	return 0;
+
+err_disable_device:
+	pci_clear_master(pdev);
+	pci_disable_device(pdev);
+err_destroy_vgpu:
+	/* Same policy as close_device: a failed teardown is worth a warning. */
+	WARN_ON(destroy_vgpu(nvdev));
+	return ret;
+}
+
+/*
+ * VFIO close_device callback: destroy the vGPU (warning if that fails),
+ * drop the BAR0 mapping and region if one was set up, then disable bus
+ * mastering and the device.
+ */
+static void
+nvidia_vgpu_vfio_close_device(struct vfio_device *vdev)
+{
+	struct vfio_pci_core_device *core = vdev_to_core_dev(vdev);
+	struct nvidia_vgpu_vfio *nv = core_dev_to_nvdev(core);
+	struct pci_dev *vf = core->pdev;
+	int err;
+
+	err = destroy_vgpu(nv);
+	WARN_ON(err);
+
+	if (nv->bar0_map != NULL) {
+		iounmap(nv->bar0_map);
+		pci_release_selected_regions(vf, 1 << 0);
+		nv->bar0_map = NULL;
+	}
+
+	pci_clear_master(vf);
+	pci_disable_device(vf);
+}
+
+/*
+ * get_region_info - handle VFIO_DEVICE_GET_REGION_INFO
+ * @core_dev: VFIO PCI core device the ioctl was issued on
+ * @arg: user pointer to a struct vfio_region_info
+ *
+ * Reports the emulated config space, BAR0-BAR4 (BAR1 sized from the
+ * selected vGPU type's bar1Length, given in MiB) and zero-sized entries
+ * for BAR5, ROM and VGA. No capability chain is reported.
+ *
+ * Returns 0 on success, -EFAULT when user memory cannot be accessed, or
+ * -EINVAL for a short argsz or an unknown region index.
+ */
+static int
+get_region_info(struct vfio_pci_core_device *core_dev, unsigned long arg)
+{
+	struct nvidia_vgpu_vfio *nvdev = core_dev_to_nvdev(core_dev);
+	struct pci_dev *pdev = core_dev->pdev;
+	struct vfio_region_info info;
+	unsigned long minsz;
+
+	minsz = offsetofend(struct vfio_region_info, offset);
+	/* A faulting user pointer is -EFAULT, not -EINVAL. */
+	if (copy_from_user(&info, (void __user *)arg, minsz))
+		return -EFAULT;
+
+	if (info.argsz < minsz)
+		return -EINVAL;
+
+	switch (info.index) {
+	case VFIO_PCI_CONFIG_REGION_INDEX:
+		info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
+		info.size = PCI_CONFIG_SPACE_LENGTH;
+		info.flags = VFIO_REGION_INFO_FLAG_READ |
+			VFIO_REGION_INFO_FLAG_WRITE;
+		break;
+
+	case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR4_REGION_INDEX:
+		info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
+
+		/* BAR1 is sized by the vGPU type, not by the hardware BAR. */
+		if (info.index == VFIO_PCI_BAR1_REGION_INDEX)
+			info.size = nvdev->curr_vgpu_type->bar1Length * 1024 * 1024;
+		else
+			info.size = pci_resource_len(pdev, info.index);
+
+		if (!info.size) {
+			info.flags = 0;
+			break;
+		}
+		info.flags = VFIO_REGION_INFO_FLAG_READ |
+			VFIO_REGION_INFO_FLAG_WRITE |
+			VFIO_REGION_INFO_FLAG_MMAP;
+		break;
+
+	case VFIO_PCI_BAR5_REGION_INDEX:
+	case VFIO_PCI_ROM_REGION_INDEX:
+	case VFIO_PCI_VGA_REGION_INDEX:
+		/* Not exposed: report an empty region, not the caller's values. */
+		info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
+		info.size = 0;
+		info.flags = 0;
+		break;
+
+	default:
+		return -EINVAL;
+	}
+
+	return copy_to_user((void __user *)arg, &info, minsz) ? -EFAULT : 0;
+}
+
+/*
+ * VFIO ioctl callback. Region info is answered locally, the reset and
+ * hot-reset requests return 0 without doing anything here, and every
+ * other command is forwarded to vfio_pci_core_ioctl(). Fails with
+ * -ENODEV while no vGPU type is selected.
+ */
+static long nvidia_vgpu_vfio_ioctl(struct vfio_device *vdev,
+				   unsigned int cmd,
+				   unsigned long arg)
+{
+	struct vfio_pci_core_device *core = vdev_to_core_dev(vdev);
+	struct nvidia_vgpu_vfio *nv = core_dev_to_nvdev(core);
+	int rc;
+
+	if (!nv->curr_vgpu_type)
+		return -ENODEV;
+
+	if (cmd == VFIO_DEVICE_GET_REGION_INFO)
+		rc = get_region_info(core, arg);
+	else if (cmd == VFIO_DEVICE_GET_PCI_HOT_RESET_INFO ||
+		 cmd == VFIO_DEVICE_PCI_HOT_RESET ||
+		 cmd == VFIO_DEVICE_RESET)
+		rc = 0;
+	else
+		rc = vfio_pci_core_ioctl(vdev, cmd, arg);
+
+	return rc;
+}
+
+/*
+ * VFIO read callback. The request is split into naturally aligned pieces
+ * of 4, 2 or 1 bytes; each piece is fetched through
+ * nvidia_vgpu_vfio_access() into a local bounce variable and then copied
+ * to userspace. *ppos advances with every piece.
+ *
+ * Returns the number of bytes read, -ENODEV without a vGPU type, -EFAULT
+ * on a failed user copy, or the (non-positive) result of a failed access.
+ */
+static ssize_t nvidia_vgpu_vfio_read(struct vfio_device *vdev,
+				     char __user *buf, size_t count,
+				     loff_t *ppos)
+{
+	struct nvidia_vgpu_vfio *nv =
+		core_dev_to_nvdev(vdev_to_core_dev(vdev));
+	size_t copied = 0;
+	u64 bounce;
+
+	if (!nv->curr_vgpu_type)
+		return -ENODEV;
+
+	while (count) {
+		int len = 1;
+		int rc;
+
+		if (count >= 4 && !(*ppos % 4))
+			len = 4;
+		else if (count >= 2 && !(*ppos % 2))
+			len = 2;
+
+		rc = nvidia_vgpu_vfio_access(nv, (char *)&bounce, len, *ppos, false);
+		if (rc <= 0)
+			return rc;
+
+		if (copy_to_user(buf, &bounce, len) != 0)
+			return -EFAULT;
+
+		copied += len;
+		count -= len;
+		buf += len;
+		*ppos += len;
+	}
+
+	return copied;
+}
+
+/*
+ * VFIO write callback. The request is split into naturally aligned pieces
+ * of 4, 2 or 1 bytes; each piece is copied from userspace into a local
+ * bounce variable and applied through nvidia_vgpu_vfio_access(). *ppos
+ * advances with every piece.
+ *
+ * Returns the number of bytes written, -ENODEV without a vGPU type,
+ * -EFAULT on a failed user copy, or the (non-positive) result of a failed
+ * access.
+ */
+static ssize_t nvidia_vgpu_vfio_write(struct vfio_device *vdev,
+				      const char __user *buf, size_t count,
+				      loff_t *ppos)
+{
+	struct nvidia_vgpu_vfio *nv =
+		core_dev_to_nvdev(vdev_to_core_dev(vdev));
+	size_t written = 0;
+	u64 bounce;
+
+	if (!nv->curr_vgpu_type)
+		return -ENODEV;
+
+	while (count) {
+		int len = 1;
+		int rc;
+
+		if (count >= 4 && !(*ppos % 4))
+			len = 4;
+		else if (count >= 2 && !(*ppos % 2))
+			len = 2;
+
+		if (copy_from_user(&bounce, buf, len) != 0)
+			return -EFAULT;
+
+		rc = nvidia_vgpu_vfio_access(nv, (char *)&bounce, len, *ppos, true);
+		if (rc <= 0)
+			return rc;
+
+		written += len;
+		count -= len;
+		buf += len;
+		*ppos += len;
+	}
+
+	return written;
+}
+
+/*
+ * VFIO mmap callback. Maps part of BAR0-BAR4 of the underlying PCI device
+ * into the caller's address space as uncached I/O memory. The region
+ * index and page offset are decoded from vma->vm_pgoff; the request must
+ * be a shared mapping that lies within the page-aligned BAR length.
+ *
+ * NOTE(review): the length check uses pci_resource_len() for every BAR,
+ * while the region-info ioctl reports bar1Length for BAR1 — confirm the
+ * two are meant to differ.
+ */
+static int nvidia_vgpu_vfio_mmap(struct vfio_device *vdev,
+				 struct vm_area_struct *vma)
+{
+	struct vfio_pci_core_device *core = vdev_to_core_dev(vdev);
+	struct nvidia_vgpu_vfio *nv = core_dev_to_nvdev(core);
+	struct pci_dev *vf = core->pdev;
+	u64 bar_len, map_len, page_off, map_start;
+	unsigned int bar;
+
+	if (!nv->curr_vgpu_type)
+		return -ENODEV;
+
+	bar = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);
+
+	if (bar >= VFIO_PCI_BAR5_REGION_INDEX ||
+	    vma->vm_end < vma->vm_start ||
+	    !(vma->vm_flags & VM_SHARED))
+		return -EINVAL;
+
+	bar_len = PAGE_ALIGN(pci_resource_len(vf, bar));
+	map_len = vma->vm_end - vma->vm_start;
+	/* Page offset inside the region: low bits of vm_pgoff. */
+	page_off = vma->vm_pgoff &
+		((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
+	map_start = page_off << PAGE_SHIFT;
+
+	if (map_len == 0 || bar_len == 0 || map_start + map_len > bar_len)
+		return -EINVAL;
+
+	vma->vm_private_data = vdev;
+	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+	/* From here on vm_pgoff is the physical PFN of the mapping start. */
+	vma->vm_pgoff = (pci_resource_start(vf, bar) >> PAGE_SHIFT) + page_off;
+	vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP);
+
+	return remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, map_len,
+			       vma->vm_page_prot);
+}
+
+/*
+ * VFIO device operations of the variant driver. open/close, ioctl, read,
+ * write and mmap are implemented in this file; the remaining callbacks
+ * are the vfio-pci core and iommufd helpers.
+ */
+static const struct vfio_device_ops nvidia_vgpu_vfio_ops = {
+	.name           = "nvidia-vgpu-vfio-pci",
+	.init		= vfio_pci_core_init_dev,
+	.release	= vfio_pci_core_release_dev,
+	.open_device    = nvidia_vgpu_vfio_open_device,
+	.close_device   = nvidia_vgpu_vfio_close_device,
+	.ioctl          = nvidia_vgpu_vfio_ioctl,
+	.device_feature = vfio_pci_core_ioctl_feature,
+	.read           = nvidia_vgpu_vfio_read,
+	.write          = nvidia_vgpu_vfio_write,
+	.mmap           = nvidia_vgpu_vfio_mmap,
+	.request	= vfio_pci_core_request,
+	.match		= vfio_pci_core_match,
+	.bind_iommufd	= vfio_iommufd_physical_bind,
+	.unbind_iommufd	= vfio_iommufd_physical_unbind,
+	.attach_ioas	= vfio_iommufd_physical_attach_ioas,
+	.detach_ioas	= vfio_iommufd_physical_detach_ioas,
+};
+
+/*
+ * Select the vGPU type used by @nvdev. The type id is currently the
+ * fixed value 869. Returns 0 when that type exists in the type table,
+ * -ENODEV otherwise (curr_vgpu_type is then NULL).
+ */
+static int setup_vgpu_type(struct nvidia_vgpu_vfio *nvdev)
+{
+	NVA081_CTRL_VGPU_INFO *type = find_vgpu_type(nvdev, 869);
+
+	nvdev->curr_vgpu_type = type;
+
+	return type ? 0 : -ENODEV;
+}
+
+/*
+ * PCI probe callback. Only virtual functions are accepted. Allocates the
+ * VFIO device, attaches the vGPU manager, selects the vGPU type, builds
+ * the emulated config space and registers with the vfio-pci core. Each
+ * failure undoes the steps already taken and is logged.
+ */
+static int nvidia_vgpu_vfio_probe(struct pci_dev *pdev,
+				  const struct pci_device_id *id_table)
+{
+	struct nvidia_vgpu_vfio *nv;
+	int err;
+
+	if (!pdev->is_virtfn)
+		return -EINVAL;
+
+	nv = vfio_alloc_device(nvidia_vgpu_vfio, core_dev.vdev,
+			       &pdev->dev, &nvidia_vgpu_vfio_ops);
+	if (IS_ERR(nv))
+		return PTR_ERR(nv);
+
+	err = attach_vgpu_mgr(nv, pdev);
+	if (err)
+		goto out_put_device;
+
+	err = setup_vgpu_type(nv);
+	if (err)
+		goto out_detach_mgr;
+
+	nvidia_vgpu_vfio_setup_config(nv);
+
+	dev_set_drvdata(&pdev->dev, &nv->core_dev);
+
+	err = vfio_pci_core_register_device(&nv->core_dev);
+	if (!err)
+		return 0;
+
+out_detach_mgr:
+	detach_vgpu_mgr(nv);
+
+out_put_device:
+	vfio_put_device(&nv->core_dev.vdev);
+
+	pci_err(pdev, "VF probe failed with ret: %d\n", err);
+	return err;
+}
+
+/*
+ * PCI remove callback: unregister from the vfio-pci core, release the
+ * vGPU manager and drop the reference on the VFIO device.
+ */
+static void nvidia_vgpu_vfio_remove(struct pci_dev *pdev)
+{
+	struct vfio_pci_core_device *core = dev_get_drvdata(&pdev->dev);
+
+	vfio_pci_core_unregister_device(core);
+	detach_vgpu_mgr(core_dev_to_nvdev(core));
+	vfio_put_device(&core->vdev);
+}
+
+/*
+ * Match every NVIDIA device whose class code is exactly "3D controller".
+ * The table is only referenced from this file, so keep it static const.
+ */
+static const struct pci_device_id nvidia_vgpu_vfio_table[] = {
+	{
+		.vendor      = PCI_VENDOR_ID_NVIDIA,
+		.device      = PCI_ANY_ID,
+		.subvendor   = PCI_ANY_ID,
+		.subdevice   = PCI_ANY_ID,
+		.class       = (PCI_CLASS_DISPLAY_3D << 8),
+		.class_mask  = ~0,
+	},
+	{ }
+};
+MODULE_DEVICE_TABLE(pci, nvidia_vgpu_vfio_table);
+
+/*
+ * PCI driver glue. Only module_pci_driver() in this file uses the object,
+ * so it does not need external linkage.
+ */
+static struct pci_driver nvidia_vgpu_vfio_driver = {
+	.name               = "nvidia-vgpu-vfio",
+	.id_table           = nvidia_vgpu_vfio_table,
+	.probe              = nvidia_vgpu_vfio_probe,
+	.remove             = nvidia_vgpu_vfio_remove,
+	.driver_managed_dma = true,
+};
+
+module_pci_driver(nvidia_vgpu_vfio_driver);
+
+MODULE_LICENSE("Dual MIT/GPL");
+MODULE_AUTHOR("Vinay Kabra <vkabra@xxxxxxxxxx>");
+MODULE_AUTHOR("Kirti Wankhede <kwankhede@xxxxxxxxxx>");
+MODULE_AUTHOR("Zhi Wang <zhiw@xxxxxxxxxx>");
+MODULE_DESCRIPTION("NVIDIA vGPU VFIO Variant Driver - User Level driver for NVIDIA vGPU");
diff --git a/drivers/vfio/pci/nvidia-vgpu/vgpu.c b/drivers/vfio/pci/nvidia-vgpu/vgpu.c
index 93d27db30a41..003ca116b4a8 100644
--- a/drivers/vfio/pci/nvidia-vgpu/vgpu.c
+++ b/drivers/vfio/pci/nvidia-vgpu/vgpu.c
@@ -328,3 +328,25 @@ int nvidia_vgpu_mgr_create_vgpu(struct nvidia_vgpu *vgpu, u8 *vgpu_type)
 	return ret;
 }
 EXPORT_SYMBOL(nvidia_vgpu_mgr_create_vgpu);
+
+/*
+ * Send the UPDATE_BME_STATE RPC for @vgpu with the enable field set and
+ * all other fields zero. Returns the result of nvidia_vgpu_rpc_call().
+ */
+static int update_bme_state(struct nvidia_vgpu *vgpu)
+{
+	NV_VGPU_CPU_RPC_DATA_UPDATE_BME_STATE bme = {
+		.enable = true,
+	};
+
+	return nvidia_vgpu_rpc_call(vgpu, NV_VGPU_CPU_RPC_MSG_UPDATE_BME_STATE,
+				    &bme, sizeof(bme));
+}
+
+/**
+ * nvidia_vgpu_mgr_enable_bme - handle BME sequence
+ * @vgpu: the vGPU instance
+ *
+ * Thin exported wrapper around update_bme_state(), which sends the
+ * UPDATE_BME_STATE RPC with the enable field set.
+ *
+ * Returns: 0 on success, others on failure.
+ */
+int nvidia_vgpu_mgr_enable_bme(struct nvidia_vgpu *vgpu)
+{
+	return update_bme_state(vgpu);
+}
+EXPORT_SYMBOL(nvidia_vgpu_mgr_enable_bme);
diff --git a/drivers/vfio/pci/nvidia-vgpu/vgpu_mgr.h b/drivers/vfio/pci/nvidia-vgpu/vgpu_mgr.h
index af922d8e539c..2c9e0eebcb99 100644
--- a/drivers/vfio/pci/nvidia-vgpu/vgpu_mgr.h
+++ b/drivers/vfio/pci/nvidia-vgpu/vgpu_mgr.h
@@ -84,6 +84,6 @@ int nvidia_vgpu_rpc_call(struct nvidia_vgpu *vgpu, u32 msg_type,
 void nvidia_vgpu_clean_rpc(struct nvidia_vgpu *vgpu);
 int nvidia_vgpu_setup_rpc(struct nvidia_vgpu *vgpu);
 
-int nvidia_vgpu_mgr_reset_vgpu(struct nvidia_vgpu *vgpu);
+int nvidia_vgpu_mgr_enable_bme(struct nvidia_vgpu *vgpu);
 
 #endif
-- 
2.34.1