On Tue, Mar 01, 2011 at 02:20:02PM +0200, Michael S. Tsirkin wrote: > On Tue, Mar 01, 2011 at 02:10:37PM +0800, Sheng Yang wrote: > > On Monday 28 February 2011 19:27:29 Michael S. Tsirkin wrote: > > > On Mon, Feb 28, 2011 at 03:20:04PM +0800, Sheng Yang wrote: > > > > Then we can support mask bit operation of assigned devices now. > > > > > > > > Signed-off-by: Sheng Yang <sheng@xxxxxxxxxxxxxxx> > > > > > > A general question: we implement mmio read and write > > > operation here, but seem to do nothing > > > about ordering. In particular, pci mmio reads are normally > > > assumed to flush all writes out to the device > > > and all device writes in to the CPU. > > > > > > How exactly does it work here? > > > > I don't understand your problem... We emulate all operation here, where is > > "ordering" issue? > > > > And Michael, thanks for your detailed comments, but could you give comments all at > > once? Now I've tried my best to address comments, but still feeling endless > > comments are coming. > > Hmm, sorry about that. At least some of the comments are in > the new code, so I could not have commented on it earlier ... > E.g. the ext_data thing only appeared in the latest version > or the one before that. In some cases such as the non-standard > error codes used, I don't always understand the logic, as > the error handling is switched to standard conventions > so comments appear as the logic becomes apparent. This > applies to EFAULT handling below. > > > And I would leave Intel this Friday, I want to get it done > > before I leave. > > Permanently? > I think it would be helpful, in that case, if you publish some > testing data detailing how best to test the patch, > which hardware etc. We will then do our best to carry on, fix up > the remaining nits if any, test and apply the patch. Yes. I would relocate to California, and work for another company starting next month. 
Since the patch is already v11 now, if there is not big logic issue, I really want to get it done before I leave. So I would still try my best to address all comments and get it checked in ASAP. > > > > > > > > --- > > > > > > > > arch/x86/include/asm/kvm_host.h | 1 + > > > > arch/x86/kvm/Makefile | 2 +- > > > > arch/x86/kvm/mmu.c | 2 + > > > > arch/x86/kvm/x86.c | 40 ++++- > > > > include/linux/kvm.h | 28 ++++ > > > > include/linux/kvm_host.h | 36 +++++ > > > > virt/kvm/assigned-dev.c | 44 ++++++ > > > > virt/kvm/kvm_main.c | 38 +++++- > > > > virt/kvm/msix_mmio.c | 301 > > > > +++++++++++++++++++++++++++++++++++++++ virt/kvm/msix_mmio.h > > > > | 25 ++++ > > > > 10 files changed, 504 insertions(+), 13 deletions(-) > > > > create mode 100644 virt/kvm/msix_mmio.c > > > > create mode 100644 virt/kvm/msix_mmio.h > > > > > > > > diff --git a/arch/x86/include/asm/kvm_host.h > > > > b/arch/x86/include/asm/kvm_host.h index aa75f21..4a390a4 100644 > > > > --- a/arch/x86/include/asm/kvm_host.h > > > > +++ b/arch/x86/include/asm/kvm_host.h > > > > @@ -635,6 +635,7 @@ enum emulation_result { > > > > > > > > EMULATE_DONE, /* no further processing */ > > > > EMULATE_DO_MMIO, /* kvm_run filled with mmio request */ > > > > EMULATE_FAIL, /* can't emulate this instruction */ > > > > > > > > + EMULATE_USERSPACE_EXIT, /* we need exit to userspace */ > > > > > > > > }; > > > > > > > > #define EMULTYPE_NO_DECODE (1 << 0) > > > > > > > > diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile > > > > index f15501f..3a0d851 100644 > > > > --- a/arch/x86/kvm/Makefile > > > > +++ b/arch/x86/kvm/Makefile > > > > @@ -7,7 +7,7 @@ CFLAGS_vmx.o := -I. 
> > > > > > > > kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ > > > > > > > > coalesced_mmio.o irq_comm.o eventfd.o \ > > > > > > > > - assigned-dev.o) > > > > + assigned-dev.o msix_mmio.o) > > > > > > > > kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o) > > > > kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix ../../../virt/kvm/, > > > > async_pf.o) > > > > > > > > diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c > > > > index 9cafbb4..912dca4 100644 > > > > --- a/arch/x86/kvm/mmu.c > > > > +++ b/arch/x86/kvm/mmu.c > > > > @@ -3358,6 +3358,8 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t > > > > cr2, u32 error_code, > > > > > > > > case EMULATE_DO_MMIO: > > > > ++vcpu->stat.mmio_exits; > > > > /* fall through */ > > > > > > > > + case EMULATE_USERSPACE_EXIT: > > > > + /* fall through */ > > > > > > > > case EMULATE_FAIL: > > > > return 0; > > > > > > > > default: > > > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > > > > index 21b84e2..87308eb 100644 > > > > --- a/arch/x86/kvm/x86.c > > > > +++ b/arch/x86/kvm/x86.c > > > > @@ -1966,6 +1966,7 @@ int kvm_dev_ioctl_check_extension(long ext) > > > > > > > > case KVM_CAP_X86_ROBUST_SINGLESTEP: > > > > case KVM_CAP_XSAVE: > > > > > > > > case KVM_CAP_ASYNC_PF: > > > > + case KVM_CAP_MSIX_MMIO: > > > > r = 1; > > > > break; > > > > > > > > case KVM_CAP_COALESCED_MMIO: > > > > @@ -3809,6 +3810,7 @@ static int emulator_write_emulated_onepage(unsigned > > > > long addr, > > > > > > > > { > > > > > > > > gpa_t gpa; > > > > struct kvm_io_ext_data ext_data; > > > > > > > > + int r; > > > > > > > > gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception); > > > > > > > > @@ -3824,18 +3826,32 @@ static int > > > > emulator_write_emulated_onepage(unsigned long addr, > > > > > > > > mmio: > > > > trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val); > > > > > > > > + r = vcpu_mmio_write(vcpu, gpa, bytes, val, &ext_data); > > > > > > > > /* > > > > > > > > * Is this MMIO 
handled locally? > > > > */ > > > > > > > > - if (!vcpu_mmio_write(vcpu, gpa, bytes, val, &ext_data)) > > > > + if (!r) > > > > > > > > return X86EMUL_CONTINUE; > > > > > > > > - vcpu->mmio_needed = 1; > > > > - vcpu->run->exit_reason = KVM_EXIT_MMIO; > > > > - vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa; > > > > - vcpu->run->mmio.len = vcpu->mmio_size = bytes; > > > > - vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1; > > > > - memcpy(vcpu->run->mmio.data, val, bytes); > > > > + if (r == -ENOTSYNC) { > > > > > > Can you replace -ENOTSYNC with KVM_EXIT_MSIX_ROUTING_UPDATE all over > > > please? > > > > How about let Avi/Marcelo decide? > > They are the ones that decide anyway :) > It's prettier though, if you have the time it'd be nice to > fix it up directly. OK, I would work out another version. > > > > > > > > + vcpu->userspace_exit_needed = 1; > > > > + vcpu->run->exit_reason = KVM_EXIT_MSIX_ROUTING_UPDATE; > > > > + vcpu->run->msix_routing.dev_id = > > > > + ext_data.msix_routing.dev_id; > > > > + vcpu->run->msix_routing.type = > > > > + ext_data.msix_routing.type; > > > > + vcpu->run->msix_routing.entry_idx = > > > > + ext_data.msix_routing.entry_idx; > > > > + vcpu->run->msix_routing.flags = > > > > + ext_data.msix_routing.flags; > > > > + } else { > > > > + vcpu->mmio_needed = 1; > > > > + vcpu->run->exit_reason = KVM_EXIT_MMIO; > > > > + vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa; > > > > + vcpu->run->mmio.len = vcpu->mmio_size = bytes; > > > > + vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1; > > > > + memcpy(vcpu->run->mmio.data, val, bytes); > > > > + } > > > > > > > > return X86EMUL_CONTINUE; > > > > > > > > } > > > > > > > > @@ -4469,6 +4485,8 @@ done: > > > > r = EMULATE_DO_MMIO; > > > > > > > > } else if (r == EMULATION_RESTART) > > > > > > > > goto restart; > > > > > > > > + else if (vcpu->userspace_exit_needed) > > > > + r = EMULATE_USERSPACE_EXIT; > > > > > > > > else > > > > > > > > r = EMULATE_DONE; > > > > > > > > 
@@ -5397,12 +5415,18 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu > > > > *vcpu, struct kvm_run *kvm_run) > > > > > > > > } > > > > > > > > } > > > > > > > > - if (vcpu->arch.pio.count || vcpu->mmio_needed) { > > > > + if (vcpu->arch.pio.count || vcpu->mmio_needed || > > > > + vcpu->userspace_exit_needed) { > > > > > > > > if (vcpu->mmio_needed) { > > > > > > > > memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8); > > > > vcpu->mmio_read_completed = 1; > > > > vcpu->mmio_needed = 0; > > > > > > > > } > > > > > > > > + if (vcpu->userspace_exit_needed) { > > > > + vcpu->userspace_exit_needed = 0; > > > > + r = 0; > > > > + goto out; > > > > + } > > > > > > > > vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); > > > > r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE); > > > > srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); > > > > > > > > diff --git a/include/linux/kvm.h b/include/linux/kvm.h > > > > index ea2dc1a..4393e4e 100644 > > > > --- a/include/linux/kvm.h > > > > +++ b/include/linux/kvm.h > > > > @@ -161,6 +161,7 @@ struct kvm_pit_config { > > > > > > > > #define KVM_EXIT_NMI 16 > > > > #define KVM_EXIT_INTERNAL_ERROR 17 > > > > #define KVM_EXIT_OSI 18 > > > > > > > > +#define KVM_EXIT_MSIX_ROUTING_UPDATE 19 > > > > > > > > /* For KVM_EXIT_INTERNAL_ERROR */ > > > > #define KVM_INTERNAL_ERROR_EMULATION 1 > > > > > > > > @@ -264,6 +265,13 @@ struct kvm_run { > > > > > > > > struct { > > > > > > > > __u64 gprs[32]; > > > > > > > > } osi; > > > > > > > > + /* KVM_EXIT_MSIX_ROUTING_UPDATE*/ > > > > + struct { > > > > + __u32 dev_id; > > > > + __u16 type; > > > > + __u16 entry_idx; > > > > + __u64 flags; > > > > + } msix_routing; > > > > > > > > /* Fix the size of the union. 
*/ > > > > char padding[256]; > > > > > > > > }; > > > > > > > > @@ -541,6 +549,7 @@ struct kvm_ppc_pvinfo { > > > > > > > > #define KVM_CAP_PPC_GET_PVINFO 57 > > > > #define KVM_CAP_PPC_IRQ_LEVEL 58 > > > > #define KVM_CAP_ASYNC_PF 59 > > > > > > > > +#define KVM_CAP_MSIX_MMIO 60 > > > > > > > > #ifdef KVM_CAP_IRQ_ROUTING > > > > > > > > @@ -672,6 +681,9 @@ struct kvm_clock_data { > > > > > > > > #define KVM_XEN_HVM_CONFIG _IOW(KVMIO, 0x7a, struct > > > > kvm_xen_hvm_config) #define KVM_SET_CLOCK _IOW(KVMIO, > > > > 0x7b, struct kvm_clock_data) #define KVM_GET_CLOCK > > > > _IOR(KVMIO, 0x7c, struct kvm_clock_data) > > > > > > > > +/* Available with KVM_CAP_MSIX_MMIO */ > > > > +#define KVM_REGISTER_MSIX_MMIO _IOW(KVMIO, 0x7d, struct > > > > kvm_msix_mmio_user) +#define KVM_UNREGISTER_MSIX_MMIO _IOW(KVMIO, > > > > 0x7e, struct kvm_msix_mmio_user) > > > > > > > > /* Available with KVM_CAP_PIT_STATE2 */ > > > > #define KVM_GET_PIT2 _IOR(KVMIO, 0x9f, struct > > > > kvm_pit_state2) #define KVM_SET_PIT2 _IOW(KVMIO, 0xa0, > > > > struct kvm_pit_state2) > > > > > > > > @@ -795,4 +807,20 @@ struct kvm_assigned_msix_entry { > > > > > > > > __u16 padding[3]; > > > > > > > > }; > > > > > > > > +#define KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV (1 << 0) > > > > + > > > > +#define KVM_MSIX_MMIO_TYPE_BASE_TABLE (1 << 8) > > > > + > > > > +#define KVM_MSIX_MMIO_TYPE_DEV_MASK 0x00ff > > > > +#define KVM_MSIX_MMIO_TYPE_BASE_MASK 0xff00 > > > > +struct kvm_msix_mmio_user { > > > > + __u32 dev_id; > > > > + __u16 type; > > > > + __u16 max_entries_nr; > > > > + __u64 base_addr; > > > > + __u64 base_va; > > > > + __u64 flags; > > > > + __u64 reserved[4]; > > > > +}; > > > > + > > > > > > > > #endif /* __LINUX_KVM_H */ > > > > > > > > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h > > > > index a32c53e..d6c05e3 100644 > > > > --- a/include/linux/kvm_host.h > > > > +++ b/include/linux/kvm_host.h > > > > @@ -68,8 +68,17 @@ enum kvm_bus { > > > > > > > > KVM_NR_BUSES > > > > > > > 
> }; > > > > > > > > +#define KVM_IO_EXT_DATA_TYPE_MSIX_ROUTING 1 > > > > > > > > struct kvm_io_ext_data { > > > > > > > > int type; /* Extended data type */ > > > > > > type seems unused for now. Do we need it? > > > I'd guess exit type should usually be sufficient. > > > > I want to get this structure self-explained. > > > > > > > + union { > > > > + struct { > > > > + u32 dev_id; /* Device ID */ > > > > + u16 type; /* Device type */ > > > > + u16 entry_idx; /* Accessed MSI-X entry index */ > > > > + u64 flags; > > > > + } msix_routing; > > > > > > So this is only for when we exit on msix routing? > > > > Yes for now. > > > > > > > + }; > > > > > > > > }; > > > > > > > > int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, > > > > > > > > @@ -165,6 +174,8 @@ struct kvm_vcpu { > > > > > > > > } async_pf; > > > > > > > > #endif > > > > > > > > + int userspace_exit_needed; > > > > + > > > > > > > > struct kvm_vcpu_arch arch; > > > > > > > > }; > > > > > > > > @@ -238,6 +249,27 @@ struct kvm_memslots { > > > > > > > > KVM_PRIVATE_MEM_SLOTS]; > > > > > > > > }; > > > > > > > > +#define KVM_MSIX_MMIO_MAX 32 > > > > + > > > > +struct kvm_msix_mmio { > > > > + u32 dev_id; > > > > + u16 type; > > > > + u16 max_entries_nr; > > > > + u64 flags; > > > > + gpa_t table_base_addr; > > > > + hva_t table_base_va; > > > > > > void __user* would make for less casting in code. > > > > OK. > > > > > > Or even > > > > > > struct kvm_msix_entry { > > > __le32 addr_lo; > > > __le32 addr_hi; > > > __le32 data; > > > __le32 ctrl; > > > }; > > > > > > struct kvm_msix_entry __user *table; > > > > > > and then we can do table[entry].ctrl > > > and so on. > > > > It's a userspace address, I think use it in this way maybe misleading. Personally > > I want to keep the current way. 
> > > > > > > + gpa_t pba_base_addr; > > > > + hva_t pba_base_va; > > > > +}; > > > > + > > > > +struct kvm_msix_mmio_dev { > > > > + struct kvm *kvm; > > > > + struct kvm_io_device table_dev; > > > > + int mmio_nr; > > > > + struct kvm_msix_mmio mmio[KVM_MSIX_MMIO_MAX]; > > > > + struct mutex lock; > > > > +}; > > > > + > > > > > > > > struct kvm { > > > > > > > > spinlock_t mmu_lock; > > > > raw_spinlock_t requests_lock; > > > > > > > > @@ -286,6 +318,7 @@ struct kvm { > > > > > > > > long mmu_notifier_count; > > > > > > > > #endif > > > > > > > > long tlbs_dirty; > > > > > > > > + struct kvm_msix_mmio_dev msix_mmio_dev; > > > > > > > > }; > > > > > > > > /* The guest did something we don't support. */ > > > > > > > > @@ -558,6 +591,9 @@ void kvm_unregister_irq_ack_notifier(struct kvm *kvm, > > > > > > > > int kvm_request_irq_source_id(struct kvm *kvm); > > > > void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id); > > > > > > > > +int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm, > > > > + int assigned_dev_id, int entry, bool mask); > > > > + > > > > > > > > /* For vcpu->arch.iommu_flags */ > > > > #define KVM_IOMMU_CACHE_COHERENCY 0x1 > > > > > > > > diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c > > > > index ae72ae6..d1598a6 100644 > > > > --- a/virt/kvm/assigned-dev.c > > > > +++ b/virt/kvm/assigned-dev.c > > > > @@ -18,6 +18,7 @@ > > > > > > > > #include <linux/interrupt.h> > > > > #include <linux/slab.h> > > > > #include "irq.h" > > > > > > > > +#include "msix_mmio.h" > > > > > > > > static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct > > > > list_head *head, > > > > > > > > int assigned_dev_id) > > > > > > > > @@ -191,12 +192,25 @@ static void kvm_free_assigned_irq(struct kvm *kvm, > > > > > > > > kvm_deassign_irq(kvm, assigned_dev, assigned_dev->irq_requested_type); > > > > > > > > } > > > > > > > > +static void assigned_device_free_msix_mmio(struct kvm *kvm, > > > > + struct kvm_assigned_dev_kernel 
*adev) > > > > +{ > > > > + struct kvm_msix_mmio mmio; > > > > + > > > > + mmio.dev_id = adev->assigned_dev_id; > > > > + mmio.type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV | > > > > + KVM_MSIX_MMIO_TYPE_BASE_TABLE; > > > > + kvm_free_msix_mmio(kvm, &mmio); > > > > +} > > > > + > > > > > > > > static void kvm_free_assigned_device(struct kvm *kvm, > > > > > > > > struct kvm_assigned_dev_kernel > > > > *assigned_dev) > > > > > > > > { > > > > > > > > kvm_free_assigned_irq(kvm, assigned_dev); > > > > > > > > + assigned_device_free_msix_mmio(kvm, assigned_dev); > > > > + > > > > > > > > __pci_reset_function(assigned_dev->dev); > > > > pci_restore_state(assigned_dev->dev); > > > > > > > > @@ -785,3 +799,33 @@ out: > > > > return r; > > > > > > > > } > > > > > > > > +/* The caller should hold kvm->lock */ > > > > +int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm, > > > > + int assigned_dev_id, int entry, bool mask) > > > > +{ > > > > + int r = -EFAULT; > > > > + struct kvm_assigned_dev_kernel *adev; > > > > + int i; > > > > + > > > > + if (!irqchip_in_kernel(kvm)) > > > > + return r; > > > > > > This is wrong, EFAULT should only be returned on > > > bad/misaligned address. > > > > > > Can we check this during setup instead? > > > And then this check can go away or become > > > BUG_ON, and the function be void. > > > > I think BUG_ON is OK. 
> > > > > > > + > > > > + adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, > > > > + assigned_dev_id); > > > > + if (!adev) > > > > + goto out; > > > > + > > > > + /* For non-MSIX enabled devices, entries_nr == 0 */ > > > > + for (i = 0; i < adev->entries_nr; i++) > > > > + if (adev->host_msix_entries[i].entry == entry) { > > > > + if (mask) > > > > + disable_irq_nosync( > > > > + adev->host_msix_entries[i].vector); > > > > + else > > > > + enable_irq(adev->host_msix_entries[i].vector); > > > > + r = 0; > > > > + break; > > > > + } > > > > +out: > > > > + return r; > > > > +} > > > > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c > > > > index a61f90e..f211e49 100644 > > > > --- a/virt/kvm/kvm_main.c > > > > +++ b/virt/kvm/kvm_main.c > > > > @@ -56,6 +56,7 @@ > > > > > > > > #include "coalesced_mmio.h" > > > > #include "async_pf.h" > > > > > > > > +#include "msix_mmio.h" > > > > > > > > #define CREATE_TRACE_POINTS > > > > #include <trace/events/kvm.h> > > > > > > > > @@ -509,6 +510,7 @@ static void kvm_destroy_vm(struct kvm *kvm) > > > > > > > > struct mm_struct *mm = kvm->mm; > > > > > > > > kvm_arch_sync_events(kvm); > > > > > > > > + kvm_unregister_msix_mmio_dev(kvm); > > > > > > > > spin_lock(&kvm_lock); > > > > list_del(&kvm->vm_list); > > > > spin_unlock(&kvm_lock); > > > > > > > > @@ -1877,6 +1879,24 @@ static long kvm_vm_ioctl(struct file *filp, > > > > > > > > mutex_unlock(&kvm->lock); > > > > break; > > > > > > > > #endif > > > > > > > > + case KVM_REGISTER_MSIX_MMIO: { > > > > + struct kvm_msix_mmio_user mmio_user; > > > > + > > > > + r = -EFAULT; > > > > + if (copy_from_user(&mmio_user, argp, sizeof mmio_user)) > > > > + goto out; > > > > + r = kvm_vm_ioctl_register_msix_mmio(kvm, &mmio_user); > > > > + break; > > > > + } > > > > + case KVM_UNREGISTER_MSIX_MMIO: { > > > > + struct kvm_msix_mmio_user mmio_user; > > > > + > > > > + r = -EFAULT; > > > > + if (copy_from_user(&mmio_user, argp, sizeof mmio_user)) > > > > + goto out; > > > > 
+ r = kvm_vm_ioctl_unregister_msix_mmio(kvm, &mmio_user); > > > > + break; > > > > + } > > > > > > > > default: > > > > r = kvm_arch_vm_ioctl(filp, ioctl, arg); > > > > if (r == -ENOTTY) > > > > > > > > @@ -1988,6 +2008,12 @@ static int kvm_dev_ioctl_create_vm(void) > > > > > > > > return r; > > > > > > > > } > > > > > > > > #endif > > > > > > > > + r = kvm_register_msix_mmio_dev(kvm); > > > > + if (r < 0) { > > > > + kvm_put_kvm(kvm); > > > > + return r; > > > > + } > > > > + > > > > > > Need to fix error handling below as well? > > > Better do it with chained gotos if yes. > > > > Let's make it another separate patch. > > Well you add a new failure mode, you need to cleanup > properly ... > > > > > > > > r = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR); > > > > if (r < 0) > > > > > > > > kvm_put_kvm(kvm); > > > > > > > > @@ -2223,14 +2249,18 @@ static void kvm_io_bus_destroy(struct kvm_io_bus > > > > *bus) > > > > > > > > int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, > > > > > > > > int len, const void *val, struct kvm_io_ext_data *ext_data) > > > > > > > > { > > > > > > > > - int i; > > > > + int i, r = -EOPNOTSUPP; > > > > > > > > struct kvm_io_bus *bus; > > > > > > > > bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu); > > > > > > > > - for (i = 0; i < bus->dev_count; i++) > > > > - if (!kvm_iodevice_write(bus->devs[i], addr, len, val, ext_data)) > > > > + for (i = 0; i < bus->dev_count; i++) { > > > > + r = kvm_iodevice_write(bus->devs[i], addr, len, val, ext_data); > > > > + if (r == -ENOTSYNC) > > > > + break; > > > > + else if (!r) > > > > > > > > return 0; > > > > > > > > - return -EOPNOTSUPP; > > > > + } > > > > + return r; > > > > > > > > } > > > > > > > > /* kvm_io_bus_read - called under kvm->slots_lock */ > > > > > > > > diff --git a/virt/kvm/msix_mmio.c b/virt/kvm/msix_mmio.c > > > > new file mode 100644 > > > > index 0000000..b2a5f86 > > > > --- /dev/null > > > > +++ b/virt/kvm/msix_mmio.c > > > > @@ -0,0 
+1,301 @@ > > > > +/* > > > > + * MSI-X MMIO emulation > > > > + * > > > > + * Copyright (c) 2010 Intel Corporation > > > > + * > > > > + * This work is licensed under the terms of the GNU GPL, version 2. See > > > > + * the COPYING file in the top-level directory. > > > > + * > > > > + * Author: > > > > + * Sheng Yang <sheng.yang@xxxxxxxxx> > > > > + */ > > > > + > > > > +#include <linux/kvm_host.h> > > > > +#include <linux/kvm.h> > > > > + > > > > +#include "msix_mmio.h" > > > > +#include "iodev.h" > > > > + > > > > +static int update_msix_mask_bit(struct kvm *kvm, struct kvm_msix_mmio > > > > *mmio, + int entry, u32 flag) > > > > +{ > > > > + if (mmio->type & KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV) > > > > > > Does something else even happen? > > > If not - BUG_ON. > > > > Not for now, the next would be your vfio? Would use BUG_ON for now. > > > > > > > + return kvm_assigned_device_update_msix_mask_bit(kvm, > > > > + mmio->dev_id, entry, flag); > > > > > > > > + return -EFAULT; > > > > > > EINVAL or something > > > > OK > > > > > > > +} > > > > + > > > > +/* Caller must hold dev->lock */ > > > > +static int get_mmio_table_index(struct kvm_msix_mmio_dev *dev, > > > > + gpa_t addr, int len) > > > > +{ > > > > + gpa_t start, end; > > > > + int i, r = -EINVAL; > > > > + > > > > + for (i = 0; i < dev->mmio_nr; i++) { > > > > + start = dev->mmio[i].table_base_addr; > > > > + end = dev->mmio[i].table_base_addr + PCI_MSIX_ENTRY_SIZE * > > > > + dev->mmio[i].max_entries_nr; > > > > + if (addr >= start && addr + len <= end) { > > > > + r = i; > > > > + break; > > > > + } > > > > + } > > > > + > > > > + return r; > > > > +} > > > > + > > > > +static int msix_table_mmio_read(struct kvm_io_device *this, gpa_t addr, > > > > int len, + void *val) > > > > +{ > > > > + /*TODO: Add big endian support */ > > > > > > likely can be removed. 
> > > > > > > + struct kvm_msix_mmio_dev *mmio_dev = > > > > + container_of(this, struct kvm_msix_mmio_dev, table_dev); > > > > + struct kvm_msix_mmio *mmio; > > > > + int idx, ret = 0, entry, offset, r; > > > > + > > > > + mutex_lock(&mmio_dev->lock); > > > > + idx = get_mmio_table_index(mmio_dev, addr, len); > > > > + if (idx < 0) { > > > > + ret = -EOPNOTSUPP; > > > > + goto out; > > > > + } > > > > + if ((len != 4 && len != 8) || addr & (len - 1)) > > > > + goto out; > > > > + > > > > + offset = addr % PCI_MSIX_ENTRY_SIZE; > > > > + mmio = &mmio_dev->mmio[idx]; > > > > + entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE; > > > > + r = copy_from_user(val, (void __user *)(mmio->table_base_va + > > > > > > cast above is not needed. > > > > > > > + entry * PCI_MSIX_ENTRY_SIZE + offset), len); > > > > + if (r) > > > > + goto out; > > > > +out: > > > > + mutex_unlock(&mmio_dev->lock); > > > > + return ret; > > > > +} > > > > + > > > > +static int msix_table_mmio_write(struct kvm_io_device *this, gpa_t addr, > > > > + int len, const void *val, > > > > + struct kvm_io_ext_data *ext_data) > > > > +{ > > > > + /*TODO: Add big endian support */ > > > > > > comment above can go I think. 
> > > > > > > + struct kvm_msix_mmio_dev *mmio_dev = > > > > + container_of(this, struct kvm_msix_mmio_dev, table_dev); > > > > + struct kvm_msix_mmio *mmio; > > > > + int idx, entry, offset, ret = 0, r = 0; > > > > + void __user *entry_base; > > > > + __le32 __user *ctrl_pos; > > > > + __le32 old_ctrl, new_ctrl; > > > > + > > > > + mutex_lock(&mmio_dev->kvm->lock); > > > > + mutex_lock(&mmio_dev->lock); > > > > + idx = get_mmio_table_index(mmio_dev, addr, len); > > > > + if (idx < 0) { > > > > + ret = -EOPNOTSUPP; > > > > + goto out; > > > > + } > > > > + if ((len != 4 && len != 8) || addr & (len - 1)) > > > > + goto out; > > > > + > > > > + offset = addr % PCI_MSIX_ENTRY_SIZE; > > > > + > > > > + mmio = &mmio_dev->mmio[idx]; > > > > + entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE; > > > > + entry_base = (void __user *)(mmio->table_base_va + > > > > + entry * PCI_MSIX_ENTRY_SIZE); > > > > + ctrl_pos = entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL; > > > > + > > > > + if (get_user(old_ctrl, ctrl_pos)) > > > > + goto out; > > > > + > > > > + /* Don't allow writing to other fields when entry is unmasked */ > > > > + if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) && > > > > > > This should be le32_to_cpu(old_ctrl) > > > I think sparse shall warn about this. > > > > Sadly it didn't. > > > > > > > + offset != PCI_MSIX_ENTRY_VECTOR_CTRL) > > > > + goto out; > > > > + > > > > + if (copy_to_user((void __user *)(entry_base + offset), val, len)) > > > > > > A cast above is no longer needed > > > > > > > + goto out; > > > > + > > > > + ext_data->type = KVM_IO_EXT_DATA_TYPE_MSIX_ROUTING; > > > > + ext_data->msix_routing.dev_id = mmio->dev_id; > > > > + ext_data->msix_routing.type = mmio->type; > > > > + ext_data->msix_routing.entry_idx = entry; > > > > + ext_data->msix_routing.flags = 0; > > > > > > Is this strictly necessary? > > > Can we fill the data in vcpu->run->msix_routing > > > directly instead? 
> > > Also, on good path we don't need to fill this structure > > > as there's no exit. Let's only do it then? > > > > Where could you get the "entry_idx"? > > It's only used to pass the exit info to userspace, isn't that right? Yes. That's the reason we need this, suggested by Avi. > > > > > > > > + > > > > + if (offset + len < PCI_MSIX_ENTRY_VECTOR_CTRL) { > > > > + ret = -ENOTSYNC; > > > > + goto out; > > > > + } > > > > + > > > > + if (get_user(new_ctrl, ctrl_pos)) > > > > + goto out; > > > > + > > > > + if (old_ctrl == new_ctrl) { > > > > + if (offset == PCI_MSIX_ENTRY_DATA && len == 8) > > > > + ret = -ENOTSYNC; > > > > + goto out; > > > > + } > > > > + if ((old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) ^ > > > > + (new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT)) > > > > + r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, > > > > + !!(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT)); > > > > > > So if update_msix_mask_bit returns EFAULT above this will > > > trigger msix exit? Why? > > > > This means kernel didn't handle the mask bit, so we need let userspace handle it - > > for enable the vectors, or others. > > I think the problem is the misuse of EFAULT here. > EFAULT should be for bad userspace addresses only. > Do you use EFAULT instead ENOTSUP, or something? ENOTSUPP is OK. 
-- regards Yang, Sheng > > > > > > > > + if (r) > > > > + ret = -ENOTSYNC; > > > > +out: > > > > + mutex_unlock(&mmio_dev->lock); > > > > + mutex_unlock(&mmio_dev->kvm->lock); > > > > + return ret; > > > > +} > > > > + > > > > +static const struct kvm_io_device_ops msix_mmio_table_ops = { > > > > + .read = msix_table_mmio_read, > > > > + .write = msix_table_mmio_write, > > > > +}; > > > > + > > > > +int kvm_register_msix_mmio_dev(struct kvm *kvm) > > > > +{ > > > > + int ret; > > > > + > > > > + kvm_iodevice_init(&kvm->msix_mmio_dev.table_dev, &msix_mmio_table_ops); > > > > + mutex_init(&kvm->msix_mmio_dev.lock); > > > > + kvm->msix_mmio_dev.kvm = kvm; > > > > + mutex_lock(&kvm->slots_lock); > > > > + ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, > > > > + &kvm->msix_mmio_dev.table_dev); > > > > + mutex_unlock(&kvm->slots_lock); > > > > + return ret; > > > > +} > > > > + > > > > +int kvm_unregister_msix_mmio_dev(struct kvm *kvm) > > > > +{ > > > > + int ret; > > > > + > > > > + mutex_lock(&kvm->slots_lock); > > > > + ret = kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, > > > > + &kvm->msix_mmio_dev.table_dev); > > > > + mutex_unlock(&kvm->slots_lock); > > > > + return ret; > > > > +} > > > > + > > > > +int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm, > > > > + struct kvm_msix_mmio_user *mmio_user) > > > > +{ > > > > + struct kvm_msix_mmio_dev *mmio_dev = &kvm->msix_mmio_dev; > > > > + struct kvm_msix_mmio *mmio = NULL; > > > > + int r = 0, i; > > > > + > > > > + mutex_lock(&mmio_dev->lock); > > > > + for (i = 0; i < mmio_dev->mmio_nr; i++) { > > > > + if (mmio_dev->mmio[i].dev_id == mmio_user->dev_id && > > > > + (mmio_dev->mmio[i].type & KVM_MSIX_MMIO_TYPE_DEV_MASK) == > > > > + (mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK)) { > > > > + mmio = &mmio_dev->mmio[i]; > > > > + if (mmio->max_entries_nr != mmio_user->max_entries_nr) { > > > > + r = -EINVAL; > > > > + goto out; > > > > + } > > > > + break; > > > > + } > > > > + } > > > > > > what does the loop 
above do? Pls add a comment. > > > > Find the existed MMIO, I think code is clear here. > > > > > > > + if (mmio_user->max_entries_nr > KVM_MAX_MSIX_PER_DEV) { > > > > + r = -EINVAL; > > > > > > -ENOSPC here as well? > > > > OK > > > > > > > + goto out; > > > > + } > > > > + /* All reserved currently */ > > > > + if (mmio_user->flags) { > > > > + r = -EINVAL; > > > > + goto out; > > > > + } > > > > + > > > > + if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK) != > > > > + KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV) { > > > > + r = -EINVAL; > > > > + goto out; > > > > + } > > > > + if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_BASE_MASK) != > > > > + KVM_MSIX_MMIO_TYPE_BASE_TABLE) { > > > > + r = -EINVAL; > > > > + goto out; > > > > + } > > > > + > > > > +#ifndef CONFIG_64BIT > > > > + if (mmio_user->base_va >= 0xffffffff || > > > > + mmio_user->base_addr >= 0xffffffff) { > > > > + r = -EINVAL; > > > > + goto out; > > > > + } > > > > +#endif > > > > > > A way to check this without ifdef (long is as pointer in length): > > > if ((unsigned long)mmio_user->base_va != > > > mmio_user->base_va) > > > > > > and for base_va it should be EFAULT I think as with any illegal > > > virtual address. > > > > OK. > > > > > > > + > > > > + /* Check alignment and accessibility */ > > > > + if ((mmio_user->base_va % PCI_MSIX_ENTRY_SIZE) || > > > > + !access_ok(VERIFY_WRITE, (void __user *)mmio_user->base_va, > > > > + mmio_user->max_entries_nr * PCI_MSIX_ENTRY_SIZE)) { > > > > > > One thing to watch for here is wrap around. AFAIK > > > access_ok does not check for it, you must do it yourself. > > > > I am curious that if so, every access_ok should go with a wrap around checking? > > > > > > > > > + r = -EINVAL; > > > > > > EFAULT is usually better for an access error. > > > > OK. 
> > > > > > > + goto out; > > > > + } > > > > + if (!mmio) { > > > > + if (mmio_dev->mmio_nr == KVM_MSIX_MMIO_MAX) { > > > > + r = -ENOSPC; > > > > + goto out; > > > > + } > > > > + mmio = &mmio_dev->mmio[mmio_dev->mmio_nr]; > > > > + mmio_dev->mmio_nr++; > > > > + } > > > > + > > > > + mmio->max_entries_nr = mmio_user->max_entries_nr; > > > > + mmio->dev_id = mmio_user->dev_id; > > > > + mmio->flags = mmio_user->flags; > > > > + > > > > + if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK) == > > > > + KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV) > > > > + mmio->type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV; > > > > + if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_BASE_MASK) == > > > > + KVM_MSIX_MMIO_TYPE_BASE_TABLE) { > > > > + mmio->type |= KVM_MSIX_MMIO_TYPE_BASE_TABLE; > > > > + mmio->table_base_addr = mmio_user->base_addr; > > > > + mmio->table_base_va = mmio_user->base_va; > > > > + } > > > > +out: > > > > + mutex_unlock(&mmio_dev->lock); > > > > + return r; > > > > +} > > > > + > > > > +int kvm_free_msix_mmio(struct kvm *kvm, struct kvm_msix_mmio *mmio) > > > > > > Pls add a comment documenting what this does. > > > > You mean code is too confusing here? I don't think so. 
> > > > > > > +{ > > > > + struct kvm_msix_mmio_dev *mmio_dev = &kvm->msix_mmio_dev; > > > > + int r = -EINVAL, i, j; > > > > + > > > > + if (!mmio) > > > > + return 0; > > > > + > > > > + mutex_lock(&mmio_dev->lock); > > > > + BUG_ON(mmio_dev->mmio_nr > KVM_MSIX_MMIO_MAX); > > > > + for (i = 0; i < mmio_dev->mmio_nr; i++) { > > > > + if (mmio_dev->mmio[i].dev_id == mmio->dev_id && > > > > + mmio_dev->mmio[i].type == mmio->type) { > > > > + r = 0; > > > > + for (j = i; j < mmio_dev->mmio_nr - 1; j++) > > > > + mmio_dev->mmio[j] = mmio_dev->mmio[j + 1]; > > > > + mmio_dev->mmio[mmio_dev->mmio_nr].max_entries_nr = 0; > > > > + mmio_dev->mmio[mmio_dev->mmio_nr].dev_id = 0; > > > > + mmio_dev->mmio[mmio_dev->mmio_nr].type = 0; > > > > + mmio_dev->mmio_nr--; > > > > + break; > > > > + } > > > > + } > > > > + mutex_unlock(&mmio_dev->lock); > > > > + return r; > > > > +} > > > > + > > > > +int kvm_vm_ioctl_unregister_msix_mmio(struct kvm *kvm, > > > > + struct kvm_msix_mmio_user *mmio_user) > > > > +{ > > > > + struct kvm_msix_mmio mmio; > > > > + > > > > + mmio.dev_id = mmio_user->dev_id; > > > > + mmio.type = mmio_user->type; > > > > + > > > > > > So you pass in kvm_msix_mmio but only 2 fields are > > > valid? This is very confusing. Please just pass the > > > two values as function params to kvm_free_msix_mmio. > > > > OK. > > > > > > > + return kvm_free_msix_mmio(kvm, &mmio); > > > > +} > > > > + > > > > diff --git a/virt/kvm/msix_mmio.h b/virt/kvm/msix_mmio.h > > > > new file mode 100644 > > > > index 0000000..01b6587 > > > > --- /dev/null > > > > +++ b/virt/kvm/msix_mmio.h > > > > @@ -0,0 +1,25 @@ > > > > +#ifndef __KVM_MSIX_MMIO_H__ > > > > +#define __KVM_MSIX_MMIO_H__ > > > > +/* > > > > + * MSI-X MMIO emulation > > > > + * > > > > + * Copyright (c) 2010 Intel Corporation > > > > + * > > > > + * This work is licensed under the terms of the GNU GPL, version 2. See > > > > + * the COPYING file in the top-level directory. 
> > > > + * > > > > + * Author: > > > > + * Sheng Yang <sheng.yang@xxxxxxxxx> > > > > + */ > > > > + > > > > +#include <linux/pci.h> > > > > > > I don't see where you use this include. > > > OTOH you do need to declare struct kvm_msix_mmio_user and > > > struct kvm here. > > > > Yes. > > > > -- > > regards > > Yang, Sheng > > > > > > > > > + > > > > +int kvm_register_msix_mmio_dev(struct kvm *kvm); > > > > +int kvm_unregister_msix_mmio_dev(struct kvm *kvm); > > > > +int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm, > > > > + struct kvm_msix_mmio_user *mmio_user); > > > > +int kvm_vm_ioctl_unregister_msix_mmio(struct kvm *kvm, > > > > + struct kvm_msix_mmio_user *mmio_user); > > > > +int kvm_free_msix_mmio(struct kvm *kvm, struct kvm_msix_mmio > > > > *mmio_user); + > > > > +#endif -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html