On 30/8/24 17:02, Xu Yilun wrote:
On Fri, Aug 30, 2024 at 02:00:30PM +1000, Alexey Kardashevskiy wrote:
On 29/8/24 20:08, Xu Yilun wrote:
diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c
index 76b7f6085dcd..a4e9db212adc 100644
--- a/virt/kvm/vfio.c
+++ b/virt/kvm/vfio.c
@@ -15,6 +15,7 @@
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>
+#include <linux/tsm.h>
#include "vfio.h"
#ifdef CONFIG_SPAPR_TCE_IOMMU
@@ -29,8 +30,14 @@ struct kvm_vfio_file {
#endif
};
+struct kvm_vfio_tdi {
+ struct list_head node;
+ struct vfio_device *vdev;
+};
+
struct kvm_vfio {
struct list_head file_list;
+ struct list_head tdi_list;
struct mutex lock;
bool noncoherent;
};
@@ -80,6 +87,22 @@ static bool kvm_vfio_file_is_valid(struct file *file)
return ret;
}
+static struct vfio_device *kvm_vfio_file_device(struct file *file)
+{
+ struct vfio_device *(*fn)(struct file *file);
+ struct vfio_device *ret;
+
+ fn = symbol_get(vfio_file_device);
+ if (!fn)
+ return NULL;
+
+ ret = fn(file);
+
+ symbol_put(vfio_file_device);
+
+ return ret;
+}
+
#ifdef CONFIG_SPAPR_TCE_IOMMU
static struct iommu_group *kvm_vfio_file_iommu_group(struct file *file)
{
@@ -297,6 +320,103 @@ static int kvm_vfio_set_file(struct kvm_device *dev, long attr,
return -ENXIO;
}
+static int kvm_dev_tsm_bind(struct kvm_device *dev, void __user *arg)
+{
+ struct kvm_vfio *kv = dev->private;
+ struct kvm_vfio_tsm_bind tb;
+ struct kvm_vfio_tdi *ktdi;
+ struct vfio_device *vdev;
+ struct fd fdev;
+ int ret;
+
+ if (copy_from_user(&tb, arg, sizeof(tb)))
+ return -EFAULT;
+
+ ktdi = kzalloc(sizeof(*ktdi), GFP_KERNEL_ACCOUNT);
+ if (!ktdi)
+ return -ENOMEM;
+
+ fdev = fdget(tb.devfd);
+ if (!fdev.file)
+ return -EBADF;
+
+ ret = -ENOENT;
+
+ mutex_lock(&kv->lock);
+
+ vdev = kvm_vfio_file_device(fdev.file);
+ if (vdev) {
+ ret = kvm_arch_tsm_bind(dev->kvm, vdev->dev, tb.guest_rid);
+ if (!ret) {
+ ktdi->vdev = vdev;
+ list_add_tail(&ktdi->node, &kv->tdi_list);
+ } else {
+ vfio_put_device(vdev);
+ }
+ }
+
+ fdput(fdev);
+ mutex_unlock(&kv->lock);
+ if (ret)
+ kfree(ktdi);
+
+ return ret;
+}
+
+static int kvm_dev_tsm_unbind(struct kvm_device *dev, void __user *arg)
+{
+ struct kvm_vfio *kv = dev->private;
+ struct kvm_vfio_tsm_bind tb;
+ struct kvm_vfio_tdi *ktdi;
+ struct vfio_device *vdev;
+ struct fd fdev;
+ int ret;
+
+ if (copy_from_user(&tb, arg, sizeof(tb)))
+ return -EFAULT;
+
+ fdev = fdget(tb.devfd);
+ if (!fdev.file)
+ return -EBADF;
+
+ ret = -ENOENT;
+
+ mutex_lock(&kv->lock);
+
+ vdev = kvm_vfio_file_device(fdev.file);
+ if (vdev) {
+ list_for_each_entry(ktdi, &kv->tdi_list, node) {
+ if (ktdi->vdev != vdev)
+ continue;
+
+ kvm_arch_tsm_unbind(dev->kvm, vdev->dev);
+ list_del(&ktdi->node);
+ kfree(ktdi);
+ vfio_put_device(vdev);
+ ret = 0;
+ break;
+ }
+ vfio_put_device(vdev);
+ }
+
+ fdput(fdev);
+ mutex_unlock(&kv->lock);
+ return ret;
+}
+
+static int kvm_vfio_set_device(struct kvm_device *dev, long attr,
+ void __user *arg)
+{
+ switch (attr) {
+ case KVM_DEV_VFIO_DEVICE_TDI_BIND:
+ return kvm_dev_tsm_bind(dev, arg);
I think the TDI bind operation should be under the control of the device
owner (i.e. VFIO driver), rather than in this bridge driver.
This is a valid point, although this means teaching VFIO about the KVM
lifetime (and KVM already holds references to VFIO groups) and
Not sure if I understand, VFIO already knows KVM lifetime via
vfio_device_get_kvm_safe(), is it?
Yeah you're right.
guest BDFns (which have no meaning for VFIO in the host kernel).
KVM is not aware of the guest BDF today.
I think we need to pass a firmware recognizable TDI identifier, which
is actually a magic number and specific to vendors. For TDX, it is the
FUNCTION_ID. So I didn't think too much to whom the identifier is
meaningful.
It needs to be the same id for "bind" operation (bind a TDI to a VM, the
op performed by QEMU) and GUEST_REQUEST (VMGEXIT from the VM so the id
comes from the guest). The host kernel is not going to parse it but just
pass to the firmware so I guess it can be just an u32.
The TDI bind
means TDI would be transitioned to CONFIG_LOCKED state, and a bunch of
device configurations breaks the state (TDISP spec 11.4.5/8/9). So the
VFIO driver should be fully aware of the TDI bind and manage unwanted
breakage.
VFIO has no control over TDI any way, cannot even know what state it is in
without talking to the firmware.
I think VFIO could talk to the firmware, that's part of the reason we are
working on the TSM module independent to KVM.
When TDI goes into ERROR, this needs to be
propagated to the VM. At the moment (afaik) it does not tell the
I assume when TDISP ERROR happens, an interrupt (e.g. AER) would be sent
to OS and VFIO driver is the one who handles it in the first place. So
maybe there has to be some TDI stuff in VFIO?
Sounds reasonable, my test device just does not do this so I have not
poked at the error handling much :) Thanks,
Thanks,
Yilun
userspace/guest about IOMMU errors and it probably should but the existing
mechanism should be able to do so. Thanks,
Thanks,
Yilun
--
Alexey
--
Alexey