Re: [PATCH 3/3] qemu-kvm: device assignment: emulate MSI-X mask bits

"Michael S. Tsirkin" <mst@xxxxxxxxxx> · Fri, 5 Nov 2010 15:54:16 +0200

On Fri, Nov 05, 2010 at 07:02:02PM +0800, Sheng Yang wrote:
> On Friday 05 November 2010 17:05:32 Michael S. Tsirkin wrote:
> > On Fri, Nov 05, 2010 at 11:20:37AM +0800, Sheng Yang wrote:
> > > On Thursday 04 November 2010 17:44:27 Michael S. Tsirkin wrote:
> > > 
> > > Thanks very much for reviewing this! Seems nobody cares about userspace
> > > one before...
> > > 
> > > > On Thu, Nov 04, 2010 at 02:18:21PM +0800, Sheng Yang wrote:
> > > > > This patch emulated MSI-X per vector mask bit on assigned device.
> > > > > 
> > > > > Signed-off-by: Sheng Yang <sheng@xxxxxxxxxxxxxxx>
> > > > > ---
> > > > > 
> > > > >  hw/device-assignment.c |  161
> > > > >  ++++++++++++++++++++++++++++++++++++++++++++++-- 1 files
> > > 
> > > changed, 155
> > > 
> > > > >  insertions(+), 6 deletions(-)
> > > > > 
> > > > > diff --git a/hw/device-assignment.c b/hw/device-assignment.c
> > > > > index 8a98876..639aa0b 100644
> > > > > --- a/hw/device-assignment.c
> > > > > +++ b/hw/device-assignment.c
> > > > > @@ -62,6 +62,11 @@ static void
> > > > > assigned_dev_load_option_rom(AssignedDevice *dev);
> > > > > 
> > > > >  static void assigned_dev_unregister_msix_mmio(AssignedDevice *dev);
> > > > > 
> > > > > +static uint32_t calc_assigned_dev_id(uint16_t seg, uint8_t bus,
> > > > > uint8_t devfn) +{
> > > > > +    return (uint32_t)seg << 16 | (uint32_t)bus << 8 |
> > > > > (uint32_t)devfn; +}
> > > > > +
> > > > > 
> > > > >  static uint32_t assigned_dev_ioport_rw(AssignedDevRegion
> > > > >  *dev_region,
> > > > >  
> > > > >                                         uint32_t addr, int len,
> > > > >                                         uint32_t *val)
> > > > >  
> > > > >  {
> > > > > 
> > > > > @@ -264,6 +269,9 @@ static void assigned_dev_iomem_map(PCIDevice
> > > > > *pci_dev, int region_num,
> > > > > 
> > > > >      AssignedDevRegion *region = &r_dev->v_addrs[region_num];
> > > > >      PCIRegion *real_region =
> > > > >      &r_dev->real_device.regions[region_num]; int ret = 0;
> > > > > 
> > > > > +#ifdef KVM_CAP_DEVICE_MSIX_MASK
> > > > > +    struct kvm_assigned_msix_mmio msix_mmio;
> > > > > +#endif
> > > > > 
> > > > >      DEBUG("e_phys=%08" FMT_PCIBUS " r_virt=%p type=%d len=%08"
> > > > >      FMT_PCIBUS " region_num=%d \n",
> > > > >      
> > > > >            e_phys, region->u.r_virtbase, type, e_size, region_num);
> > > > > 
> > > > > @@ -282,6 +290,16 @@ static void assigned_dev_iomem_map(PCIDevice
> > > > > *pci_dev, int region_num,
> > > > > 
> > > > >              cpu_register_physical_memory(e_phys + offset,
> > > > >              
> > > > >                      TARGET_PAGE_SIZE, r_dev->mmio_index);
> > > > > 
> > > > > +#ifdef KVM_CAP_DEVICE_MSIX_MASK
> > > > > +	    memset(&msix_mmio, 0, sizeof(struct kvm_assigned_msix_mmio));
> > > > > +	    msix_mmio.assigned_dev_id =
> > > > > calc_assigned_dev_id(r_dev->h_segnr, +			    r_dev->h_busnr,
> > > > > r_dev->h_devfn);
> > > > > +	    msix_mmio.base_addr = e_phys + offset;
> > > > > +            /* We required kernel MSI-X support */
> > > > > +	    ret = kvm_assign_reg_msix_mmio(kvm_context, &msix_mmio);
> > > > > +	    if (ret)
> > > > > +                fprintf(stderr, "fail to register in-kernel
> > > > > msix_mmio!\n"); +#endif
> > > > > 
> > > > >          }
> > > > >      
> > > > >      }
> > > > > 
> > > > > @@ -824,11 +842,6 @@ static void free_assigned_device(AssignedDevice
> > > > > *dev)
> > > > > 
> > > > >      }
> > > > >  
> > > > >  }
> > > > > 
> > > > > -static uint32_t calc_assigned_dev_id(uint16_t seg, uint8_t bus,
> > > > > uint8_t devfn) -{
> > > > > -    return (uint32_t)seg << 16 | (uint32_t)bus << 8 |
> > > > > (uint32_t)devfn; -}
> > > > > -
> > > > > 
> > > > >  static void assign_failed_examine(AssignedDevice *dev)
> > > > >  {
> > > > >  
> > > > >      char name[PATH_MAX], dir[PATH_MAX], driver[PATH_MAX] = {}, *ns;
> > > > > 
> > > > > @@ -1123,6 +1136,27 @@ static int
> > > > > get_msix_entries_max_nr(AssignedDevice *adev)
> > > > > 
> > > > >      return entries_max_nr;
> > > > >  
> > > > >  }
> > > > > 
> > > > > +#ifdef KVM_CAP_DEVICE_MSIX_MASK
> > > > > +static int assigned_dev_msix_entry_masked(AssignedDevice *adev, int
> > > > > entry) +{
> > > > > +    struct kvm_assigned_msix_entry msix_entry;
> > > > > +    int r;
> > > > > +
> > > > > +    memset(&msix_entry, 0, sizeof msix_entry);
> > > > > +    msix_entry.assigned_dev_id = calc_assigned_dev_id(adev->h_segnr,
> > > > > +            adev->h_busnr, (uint8_t)adev->h_devfn);
> > > > > +    msix_entry.entry = entry;
> > > > > +    msix_entry.flags = KVM_MSIX_FLAG_QUERY_MASK;
> > > > > +    r = kvm_assign_get_msix_entry(kvm_context, &msix_entry);
> > > > > +    if (r) {
> > > > > +        fprintf(stderr, "assigned_dev_msix_entry_masked: "
> > > > > +			"Fail to get mask bit of entry %d\n", entry);
> > > > > +        return 1;
> > > > 
> > > > This error handling seems pretty useless. assert?
> > > 
> > > Maybe we can continue with it. Assert seems a little strict.
> > 
> > Well need to consider whether to return 1 or 0 then.
> 
> Tell it it's masked then routing change may be continue. 

OK, then
1. we probably want to print this only once, not on each write
2. only try doing this if capability is present.

> > 
> > > > > +    }
> > > > > +    return (msix_entry.flags & KVM_MSIX_FLAG_MASK);
> > > > > +}
> > > > > +#endif
> > > > > +
> > > > > 
> > > > >  static int get_msix_valid_entries_nr(AssignedDevice *adev,
> > > > >  
> > > > >  				     uint16_t entries_max_nr)
> > > > >  
> > > > >  {
> > > > > 
> > > > > @@ -1136,7 +1170,11 @@ static int
> > > > > get_msix_valid_entries_nr(AssignedDevice *adev,
> > > > > 
> > > > >          memcpy(&msg_ctrl, va + i * 16 + 12, 4);
> > > > >          memcpy(&msg_data, va + i * 16 + 8, 4);
> > > > >          /* Ignore unused entry even it's unmasked */
> > > > > 
> > > > > +#ifdef KVM_CAP_DEVICE_MSIX_MASK
> > > > > +        if (assigned_dev_msix_entry_masked(adev, i))
> > > > > +#else
> > > > > 
> > > > >          if (msg_data == 0)
> > > > > 
> > > > > +#endif
> > > > 
> > > > So, we are replacing msg_data == 0 check with masked check?
> > > > If yes why not do this for non-KVM_CAP_DEVICE_MSIX_MASK too?
> > > 
> > > Because the old one doesn't have idea about mask bit...
> > 
> > So track mask bits in userspace.
> 
> That's not the target of this patch. And it still won't work.

Sure it will, at least to figure out # of entries.

> > 
> > > > >              continue;
> > > > >          
> > > > >          entries_nr ++;
> > > > >      
> > > > >      }
> > > > > 
> > > > > @@ -1165,6 +1203,8 @@ static int
> > > > > assigned_dev_update_msix_mmio(PCIDevice *pci_dev,
> > > > > 
> > > > >      }
> > > > >      
> > > > >      free_dev_irq_entries(adev);
> > > > > 
> > > > > +    memset(pci_dev->msix_entry_used, 0, KVM_MAX_MSIX_PER_DEV *
> > > > > +
> > > > > sizeof(*pci_dev->msix_entry_used));
> > > > > 
> > > > >      adev->irq_entries_nr = entries_nr;
> > > > >      adev->entry = calloc(entries_nr, sizeof(struct
> > > > >      kvm_irq_routing_entry)); if (!adev->entry) {
> > > > > 
> > > > > @@ -1179,7 +1219,11 @@ static int
> > > > > assigned_dev_update_msix_mmio(PCIDevice *pci_dev,
> > > > > 
> > > > >              break;
> > > > >          
> > > > >          memcpy(&msg_ctrl, va + i * 16 + 12, 4);
> > > > >          memcpy(&msg_data, va + i * 16 + 8, 4);
> > > > > 
> > > > > +#ifdef KVM_CAP_DEVICE_MSIX_MASK
> > > > > +        if (assigned_dev_msix_entry_masked(adev, i))
> > > > > +#else
> > > > > 
> > > > >          if (msg_data == 0)
> > > > > 
> > > > > +#endif
> > > > 
> > > > You can't use ifdef to check that kernel supports an ioctl.
> > > > You must check this at runtime.
> > > 
> > > Maybe we can convert them in bulk later.
> > 
> > Well you are adding a bug, all writes on old kernels
> > will spew out a ton of stderr.
> > We can't just assume new kernels.
> > 
> > > > >              continue;
> > > > >          
> > > > >          memcpy(&msg_addr, va + i * 16, 4);
> > > > > 
> > > > > @@ -1200,6 +1244,7 @@ static int
> > > > > assigned_dev_update_msix_mmio(PCIDevice *pci_dev,
> > > > > 
> > > > >          msix_entry.gsi = adev->entry[entries_nr].gsi;
> > > > >          msix_entry.entry = i;
> > > > > 
> > > > > +        pci_dev->msix_entry_used[i] = 1;
> > > > > 
> > > > >          r = kvm_assign_set_msix_entry(kvm_context, &msix_entry);
> > > > >          if (r) {
> > > > >          
> > > > >              fprintf(stderr, "fail to set MSI-X entry! %s\n",
> > > > >              strerror(-r));
> > > > > 
> > > > > @@ -1243,6 +1288,8 @@ static void assigned_dev_update_msix(PCIDevice
> > > > > *pci_dev, int enable_msix)
> > > > > 
> > > > >              perror("assigned_dev_update_msix: deassign irq");
> > > > >          
> > > > >          assigned_dev->irq_requested_type = 0;
> > > > > 
> > > > > +        memset(pci_dev->msix_entry_used, 0, KVM_MAX_MSIX_PER_DEV *
> > > > > +
> > > > > sizeof(*pci_dev->msix_entry_used));
> > > > > 
> > > > >      }
> > > > >      
> > > > >      entries_max_nr = get_msix_entries_max_nr(assigned_dev);
> > > > > 
> > > > > @@ -1250,10 +1297,16 @@ static void
> > > > > assigned_dev_update_msix(PCIDevice *pci_dev, int enable_msix)
> > > > > 
> > > > >          fprintf(stderr, "assigned_dev_update_msix: MSI-X
> > > > >          entries_max_nr == 0"); return;
> > > > >      
> > > > >      }
> > > > > 
> > > > > +    /*
> > > > > +     * Guest may try to enable MSI-X before setting MSI-X entry
> > > > > done, so +     * let's wait until guest unmask the entries.
> > > > > +     */
> > > > 
> > > > Well it can also set up any number of entries, enable msix then
> > > > set up more entries. Now what?
> > > 
> > > It's the same. If it want to set up more entries, it have to unmask them.
> > > Then we would get them.
> > 
> > Why can't we handle the first enable in the same way?
> 
> Don't understand the question, but I guess the answer is pci_enable_msix().

pci_enable_msix is an internal kernel API to enable msix, isn't it?
We are discussing qemu patch here.

> > 
> > > > >      entries_nr = get_msix_valid_entries_nr(assigned_dev,
> > > > >      entries_max_nr); if (entries_nr == 0) {
> > > > > 
> > > > > +#ifndef KVM_CAP_DEVICE_MSIX_MASK
> > > > > 
> > > > >          if (enable_msix)
> > > > >          
> > > > >              fprintf(stderr, "MSI-X entry number is zero!\n");
> > > > 
> > > > And what happens then?
> > > 
> > > MSI-X can't work for old ones without MSIX mask support.
> > 
> > Old ones?
> 
> Old userspace without KVM_CAP_DEVICE_MSIX_MASK
> > 
> > > Reload the guest module
> > > may help.
> > 
> > Guest might not have any concept of modules.
> 
> Just an workaround. Not a suggestion method.

Look, if there's some failure mode we need a way to recover,
or to make it very unlikely. If this is guest doing something
illegal let's add a comment explaining what it is and why
it's illegal. If this is legal, why the print out?

> --
> regards
> Yang, Sheng
> 
> > 
> > > --
> > > regards
> > > Yang, Sheng
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html