On 11/24/2014 09:56 PM, Alex Williamson wrote: > On Sun, 2014-11-23 at 19:35 +0100, Eric Auger wrote: >> This patch introduces a new KVM_DEV_VFIO_DEVICE group. >> >> This is a new control channel which enables KVM to cooperate with >> viable VFIO devices. >> >> Functions are introduced to check the validity of a VFIO device >> file descriptor, increment/decrement the ref counter of the VFIO >> device. >> >> The patch introduces 2 attributes for this new device group: >> KVM_DEV_VFIO_DEVICE_FORWARD_IRQ, KVM_DEV_VFIO_DEVICE_UNFORWARD_IRQ. >> Their purpose is to turn a VFIO device IRQ into a forwarded IRQ and >> unset respectively unset the feature. >> >> The VFIO device stores a list of registered forwarded IRQs. The reference >> counter of the device is incremented each time a new IRQ is forwarded. >> Reference counter is decremented when the IRQ forwarding is unset. >> >> The forwarding programmming is architecture specific, implemented in >> kvm_arch_set_fwd_state function. Architecture specific implementation is >> enabled when __KVM_HAVE_ARCH_KVM_VFIO_FORWARD is set. When not set those >> functions are void. >> >> Signed-off-by: Eric Auger <eric.auger@xxxxxxxxxx> >> >> --- >> >> v2 -> v3: >> - add API comments in kvm_host.h >> - improve the commit message >> - create a private kvm_vfio_fwd_irq struct >> - fwd_irq_action replaced by a bool and removal of VFIO_IRQ_CLEANUP. This >> latter action will be handled in vgic. >> - add a vfio_device handle argument to kvm_arch_set_fwd_state. The goal is >> to move platform specific stuff in architecture specific code. >> - kvm_arch_set_fwd_state renamed into kvm_arch_vfio_set_forward >> - increment the ref counter each time we do an IRQ forwarding and decrement >> this latter each time one IRQ forward is unset. Simplifies the whole >> ref counting. >> - simplification of list handling: create, search, removal >> >> v1 -> v2: >> - __KVM_HAVE_ARCH_KVM_VFIO renamed into __KVM_HAVE_ARCH_KVM_VFIO_FORWARD >> - original patch file separated into 2 parts: generic part moved in vfio.c >> and ARM specific part(kvm_arch_set_fwd_state) >> --- >> include/linux/kvm_host.h | 28 ++++++ >> virt/kvm/vfio.c | 249 ++++++++++++++++++++++++++++++++++++++++++++++- >> 2 files changed, 274 insertions(+), 3 deletions(-) >> >> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h >> index ea53b04..0b9659d 100644 >> --- a/include/linux/kvm_host.h >> +++ b/include/linux/kvm_host.h >> @@ -1076,6 +1076,15 @@ struct kvm_device_ops { >> unsigned long arg); >> }; >> >> +/* internal self-contained structure describing a forwarded IRQ */ >> +struct kvm_fwd_irq { >> + struct kvm *kvm; /* VM to inject the GSI into */ >> + struct vfio_device *vdev; /* vfio device the IRQ belongs to */ >> + __u32 index; /* VFIO device IRQ index */ >> + __u32 subindex; /* VFIO device IRQ subindex */ >> + __u32 gsi; /* gsi, ie. virtual IRQ number */ >> +}; >> + >> void kvm_device_get(struct kvm_device *dev); >> void kvm_device_put(struct kvm_device *dev); >> struct kvm_device *kvm_device_from_filp(struct file *filp); >> @@ -1085,6 +1094,25 @@ void kvm_unregister_device_ops(u32 type); >> extern struct kvm_device_ops kvm_mpic_ops; >> extern struct kvm_device_ops kvm_xics_ops; >> >> +#ifdef __KVM_HAVE_ARCH_KVM_VFIO_FORWARD >> +/** >> + * kvm_arch_vfio_set_forward - changes the forwarded state of an IRQ >> + * >> + * @fwd_irq: handle to the forwarded irq struct >> + * @forward: true means forwarded, false means not forwarded >> + * returns 0 on success, < 0 on failure >> + */ >> +int kvm_arch_vfio_set_forward(struct kvm_fwd_irq *fwd_irq, >> + bool forward); > > We could add a struct device* to the args list or into struct > kvm_fwd_irq so that arch code doesn't need to touch the vdev. arch code > has no business dealing with references to the vfio_device. Hi Alex, Currently It can't put struct device* into the kvm_fwd_irq struct since I need to release the vfio_device with vfio_device_put_external_user(struct vfio_device *vdev) typically in kvm_vfio_clean_fwd_irq. So I need to store the pointers to the vfio_device somewhere. I see 2 solutions: change the proto of vfio_device_put_external_user(struct vfio_device *vdev) and pass a struct device* (??) or change the proto of kvm_arch_vfio_set_forward into kvm_arch_vfio_set_forward(struct kvm *kvm, struct device *dev, int index, [int subindex], int gsi, bool forward) or using index/start/count but loosing the interest of having a self-contained internal struct. > >> + >> +#else >> +static inline int kvm_arch_vfio_set_forward(struct kvm_fwd_irq *fwd_irq, >> + bool forward) >> +{ >> + return 0; >> +} >> +#endif >> + >> #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT >> >> static inline void kvm_vcpu_set_in_spin_loop(struct kvm_vcpu *vcpu, bool val) >> diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c >> index 6f0cc34..af178bb 100644 >> --- a/virt/kvm/vfio.c >> +++ b/virt/kvm/vfio.c >> @@ -25,8 +25,16 @@ struct kvm_vfio_group { >> struct vfio_group *vfio_group; >> }; >> >> +/* private linkable kvm_fwd_irq struct */ >> +struct kvm_vfio_fwd_irq_node { >> + struct list_head link; >> + struct kvm_fwd_irq fwd_irq; >> +}; >> + >> struct kvm_vfio { >> struct list_head group_list; >> + /* list of registered VFIO forwarded IRQs */ >> + struct list_head fwd_node_list; >> struct mutex lock; >> bool noncoherent; >> }; >> @@ -247,12 +255,239 @@ static int kvm_vfio_set_group(struct kvm_device *dev, long attr, u64 arg) >> return -ENXIO; >> } >> >> +/** >> + * kvm_vfio_get_vfio_device - Returns a handle to a vfio-device >> + * >> + * Checks it is a valid vfio device and increments its reference counter >> + * @fd: file descriptor of the vfio platform device >> + */ >> +static struct vfio_device *kvm_vfio_get_vfio_device(int fd) >> +{ >> + struct fd f = fdget(fd); >> + struct vfio_device *vdev; >> + >> + if (!f.file) >> + return NULL; > > ERR_PTR(-EINVAL)? > > ie. propagate errors from the point where they're encountered so we > don't need to make up an errno later. yes thanks > >> + vdev = kvm_vfio_device_get_external_user(f.file); >> + fdput(f); >> + return vdev; >> +} >> + >> +/** >> + * kvm_vfio_put_vfio_device: decrements the reference counter of the >> + * vfio platform * device >> + * >> + * @vdev: vfio_device handle to release >> + */ >> +static void kvm_vfio_put_vfio_device(struct vfio_device *vdev) >> +{ >> + kvm_vfio_device_put_external_user(vdev); >> +} >> + >> +/** >> + * kvm_vfio_find_fwd_irq - checks whether a forwarded IRQ already is >> + * registered in the list of forwarded IRQs >> + * >> + * @kv: handle to the kvm-vfio device >> + * @fwd: handle to the forwarded irq struct >> + * In the positive returns the handle to its node in the kvm-vfio >> + * forwarded IRQ list, returns NULL otherwise. >> + * Must be called with kv->lock hold. >> + */ >> +static struct kvm_vfio_fwd_irq_node *kvm_vfio_find_fwd_irq( >> + struct kvm_vfio *kv, >> + struct kvm_fwd_irq *fwd) >> +{ >> + struct kvm_vfio_fwd_irq_node *node; >> + >> + list_for_each_entry(node, &kv->fwd_node_list, link) { >> + if ((node->fwd_irq.index == fwd->index) && >> + (node->fwd_irq.subindex == fwd->subindex) && >> + (node->fwd_irq.vdev == fwd->vdev)) >> + return node; >> + } >> + return NULL; >> +} >> +/** >> + * kvm_vfio_register_fwd_irq - Allocates, populates and registers a >> + * forwarded IRQ >> + * >> + * @kv: handle to the kvm-vfio device >> + * @fwd: handle to the forwarded irq struct >> + * In case of success returns a handle to the new list node, >> + * NULL otherwise. >> + * Must be called with kv->lock hold. >> + */ >> +static struct kvm_vfio_fwd_irq_node *kvm_vfio_register_fwd_irq( >> + struct kvm_vfio *kv, >> + struct kvm_fwd_irq *fwd) >> +{ >> + struct kvm_vfio_fwd_irq_node *node; >> + >> + node = kmalloc(sizeof(*node), GFP_KERNEL); >> + if (!node) >> + return NULL; > > ERR_PTR(-ENOMEM)? > OK >> + >> + node->fwd_irq = *fwd; >> + >> + list_add(&node->link, &kv->fwd_node_list); >> + >> + return node; >> +} >> + >> +/** >> + * kvm_vfio_unregister_fwd_irq - unregisters and frees a forwarded IRQ >> + * >> + * @node: handle to the node struct >> + * Must be called with kv->lock hold. >> + */ >> +static void kvm_vfio_unregister_fwd_irq(struct kvm_vfio_fwd_irq_node *node) >> +{ >> + list_del(&node->link); >> + kfree(node); >> +} >> + >> +/** >> + * kvm_vfio_set_forward - turns a VFIO device IRQ into a forwarded IRQ >> + * @kv: handle to the kvm-vfio device >> + * @fd: file descriptor of the vfio device the IRQ belongs to >> + * @fwd: handle to the forwarded irq struct >> + * >> + * Registers an IRQ as forwarded and calls the architecture specific >> + * implementation of set_forward. In case of operation failure, the IRQ >> + * is unregistered. In case of success, the vfio device ref counter is >> + * incremented. >> + */ >> +static int kvm_vfio_set_forward(struct kvm_vfio *kv, int fd, >> + struct kvm_fwd_irq *fwd) >> +{ >> + int ret; >> + struct kvm_vfio_fwd_irq_node *node = >> + kvm_vfio_find_fwd_irq(kv, fwd); >> + >> + if (node) >> + return -EINVAL; > > I assume you're saving -EBUSY for arch code for the case where the IRQ > is already active? yes. -EBUSY now corresponds to the case where the VFIO signaling is already setup. > >> + node = kvm_vfio_register_fwd_irq(kv, fwd); >> + if (!node) >> + return -ENOMEM; > > if (IS_ERR(node)) > return PTR_ERR(node); > >> + ret = kvm_arch_vfio_set_forward(fwd, true); >> + if (ret < 0) { >> + kvm_vfio_unregister_fwd_irq(node); >> + return ret; >> + } >> + /* increment the ref counter */ >> + kvm_vfio_get_vfio_device(fd); > > Wouldn't it be easier if the reference counting were coupled with the > register/unregister_fwd_irq? ok I'd be tempted to pass your user_fwd_irq > to this function instead of a kvm_fwd_irq. Well in that case I would use kvm_arch_forwarded_irq for both set and unset. Then comes the problem of kvm_vfio_clean_fwd_irq which manipulates only internal structs. >> + return ret; >> +} >> + >> +/** >> + * kvm_vfio_unset_forward - Sets a VFIO device IRQ as non forwarded >> + * @kv: handle to the kvm-vfio device >> + * @fwd: handle to the forwarded irq struct >> + * >> + * Calls the architecture specific implementation of set_forward and >> + * unregisters the IRQ from the forwarded IRQ list. Decrements the vfio >> + * device reference counter. >> + */ >> +static int kvm_vfio_unset_forward(struct kvm_vfio *kv, >> + struct kvm_fwd_irq *fwd) >> +{ >> + int ret; >> + struct kvm_vfio_fwd_irq_node *node = >> + kvm_vfio_find_fwd_irq(kv, fwd); >> + if (!node) >> + return -EINVAL; >> + ret = kvm_arch_vfio_set_forward(fwd, false); > > Whoa, if the unforward fails we continue to undo everything else? How > does userspace cleanup from this? We need a guaranteed shutdown path > for cleanup code, we can never trust userspace to do things in the > correct order. Can we really preclude the user calling unforward with > an active IRQ? Maybe _forward() and _unforward() need to be separate > functions so that 'un' can be made void to indicate it can't fail. If I accept an unforward while the VFIO signaling mechanism is set, the wrong handler will be setup on VFIO driver. So should I put in place a mechanism that changes the VFIO handler for that irq (causing VFIO driver free_irq/request_irq), using another external API function? > >> + kvm_vfio_unregister_fwd_irq(node); >> + >> + /* decrement the ref counter */ >> + kvm_vfio_put_vfio_device(fwd->vdev); > > Again, seems like the unregister should do this. ok > >> + return ret; >> +} >> + >> +static int kvm_vfio_control_irq_forward(struct kvm_device *kdev, long attr, >> + int32_t __user *argp) >> +{ >> + struct kvm_arch_forwarded_irq user_fwd_irq; >> + struct kvm_fwd_irq fwd; >> + struct vfio_device *vdev; >> + struct kvm_vfio *kv = kdev->private; >> + int ret; >> + >> + if (copy_from_user(&user_fwd_irq, argp, sizeof(user_fwd_irq))) >> + return -EFAULT; >> + >> + vdev = kvm_vfio_get_vfio_device(user_fwd_irq.fd); >> + if (IS_ERR(vdev)) { >> + ret = PTR_ERR(vdev); >> + goto out; >> + } >> + >> + fwd.vdev = vdev; >> + fwd.kvm = kdev->kvm; >> + fwd.index = user_fwd_irq.index; >> + fwd.subindex = user_fwd_irq.subindex; >> + fwd.gsi = user_fwd_irq.gsi; >> + >> + switch (attr) { >> + case KVM_DEV_VFIO_DEVICE_FORWARD_IRQ: >> + mutex_lock(&kv->lock); >> + ret = kvm_vfio_set_forward(kv, user_fwd_irq.fd, &fwd); >> + mutex_unlock(&kv->lock); >> + break; >> + case KVM_DEV_VFIO_DEVICE_UNFORWARD_IRQ: >> + mutex_lock(&kv->lock); >> + ret = kvm_vfio_unset_forward(kv, &fwd); >> + mutex_unlock(&kv->lock); >> + break; >> + } >> +out: >> + kvm_vfio_put_vfio_device(vdev); > > It might add a little extra code, but logically the reference being tied > to the register/unregister feels a bit cleaner than this. Sorry I do not catch your comment here. Since i called kvm_vfio_get_vfio_device at the beg of the function, I need to release at the end of the function, besides the ref counter incr/decr at registration? > >> + return ret; >> +} >> + >> +static int kvm_vfio_set_device(struct kvm_device *kdev, long attr, u64 arg) >> +{ >> + int32_t __user *argp = (int32_t __user *)(unsigned long)arg; >> + int ret; >> + >> + switch (attr) { >> + case KVM_DEV_VFIO_DEVICE_FORWARD_IRQ: >> + case KVM_DEV_VFIO_DEVICE_UNFORWARD_IRQ: >> + ret = kvm_vfio_control_irq_forward(kdev, attr, argp); >> + break; >> + default: >> + ret = -ENXIO; >> + } >> + return ret; >> +} >> + >> +/** >> + * kvm_vfio_clean_fwd_irq - Unset forwarding state of all >> + * registered forwarded IRQs and free their list nodes. >> + * @kv: kvm-vfio device >> + * >> + * Loop on all registered device/IRQ combos, reset the non forwarded state, >> + * void the lists and release the reference >> + */ >> +static int kvm_vfio_clean_fwd_irq(struct kvm_vfio *kv) >> +{ >> + struct kvm_vfio_fwd_irq_node *node, *tmp; >> + >> + list_for_each_entry_safe(node, tmp, &kv->fwd_node_list, link) { >> + kvm_vfio_unset_forward(kv, &node->fwd_irq); >> + } >> + return 0; > > This shouldn't be able to fail, make it void. see above questioning? But you're right, I am too much virtualization oriented. Currently my cleanup is even done in the virtual interrupt controller when the VM dies because nobody unsets the VFIO signaling. Best Regards Eric > >> +} >> + >> static int kvm_vfio_set_attr(struct kvm_device *dev, >> struct kvm_device_attr *attr) >> { >> switch (attr->group) { >> case KVM_DEV_VFIO_GROUP: >> return kvm_vfio_set_group(dev, attr->attr, attr->addr); >> + case KVM_DEV_VFIO_DEVICE: >> + return kvm_vfio_set_device(dev, attr->attr, attr->addr); >> } >> >> return -ENXIO; >> @@ -268,10 +503,17 @@ static int kvm_vfio_has_attr(struct kvm_device *dev, >> case KVM_DEV_VFIO_GROUP_DEL: >> return 0; >> } >> - >> break; >> +#ifdef __KVM_HAVE_ARCH_KVM_VFIO_FORWARD >> + case KVM_DEV_VFIO_DEVICE: >> + switch (attr->attr) { >> + case KVM_DEV_VFIO_DEVICE_FORWARD_IRQ: >> + case KVM_DEV_VFIO_DEVICE_UNFORWARD_IRQ: >> + return 0; >> + } >> + break; >> +#endif >> } >> - >> return -ENXIO; >> } >> >> @@ -285,7 +527,7 @@ static void kvm_vfio_destroy(struct kvm_device *dev) >> list_del(&kvg->node); >> kfree(kvg); >> } >> - >> + kvm_vfio_clean_fwd_irq(kv); >> kvm_vfio_update_coherency(dev); >> >> kfree(kv); >> @@ -317,6 +559,7 @@ static int kvm_vfio_create(struct kvm_device *dev, u32 type) >> return -ENOMEM; >> >> INIT_LIST_HEAD(&kv->group_list); >> + INIT_LIST_HEAD(&kv->fwd_node_list); >> mutex_init(&kv->lock); >> >> dev->private = kv; > > > -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html