Re: [PATCH 3/3] qemu-kvm: device assignment: emulate MSI-X mask bits

Sheng Yang <sheng@xxxxxxxxxxxxxxx> · Mon, 8 Nov 2010 13:17:47 +0800

On Friday 05 November 2010 21:54:16 Michael S. Tsirkin wrote:
> On Fri, Nov 05, 2010 at 07:02:02PM +0800, Sheng Yang wrote:
> > On Friday 05 November 2010 17:05:32 Michael S. Tsirkin wrote:
> > > On Fri, Nov 05, 2010 at 11:20:37AM +0800, Sheng Yang wrote:
> > > > On Thursday 04 November 2010 17:44:27 Michael S. Tsirkin wrote:
> > > > 
> > > > Thanks very much for reviewing this! Seems nobody cares about
> > > > userspace one before...
> > > > 
> > > > > On Thu, Nov 04, 2010 at 02:18:21PM +0800, Sheng Yang wrote:
> > > > > > This patch emulated MSI-X per vector mask bit on assigned device.
> > > > > > 
> > > > > > Signed-off-by: Sheng Yang <sheng@xxxxxxxxxxxxxxx>
> > > > > > ---
> > > > > > 
> > > > > >  hw/device-assignment.c |  161
> > > > > >  ++++++++++++++++++++++++++++++++++++++++++++++-- 1 
files
> > > > 
> > > > changed, 155
> > > > 
> > > > > >  insertions(+), 6 deletions(-)
> > > > > > 
> > > > > > diff --git a/hw/device-assignment.c b/hw/device-assignment.c
> > > > > > index 8a98876..639aa0b 100644
> > > > > > --- a/hw/device-assignment.c
> > > > > > +++ b/hw/device-assignment.c
> > > > > > @@ -62,6 +62,11 @@ static void
> > > > > > assigned_dev_load_option_rom(AssignedDevice *dev);
> > > > > > 
> > > > > >  static void assigned_dev_unregister_msix_mmio(AssignedDevice
> > > > > >  *dev);
> > > > > > 
> > > > > > +static uint32_t calc_assigned_dev_id(uint16_t seg, uint8_t bus,
> > > > > > uint8_t devfn) +{
> > > > > > +    return (uint32_t)seg << 16 | (uint32_t)bus << 8 |
> > > > > > (uint32_t)devfn; +}
> > > > > > +
> > > > > > 
> > > > > >  static uint32_t assigned_dev_ioport_rw(AssignedDevRegion
> > > > > >  *dev_region,
> > > > > >  
> > > > > >                                         uint32_t addr, int len,
> > > > > >                                         uint32_t *val)
> > > > > >  
> > > > > >  {
> > > > > > 
> > > > > > @@ -264,6 +269,9 @@ static void assigned_dev_iomem_map(PCIDevice
> > > > > > *pci_dev, int region_num,
> > > > > > 
> > > > > >      AssignedDevRegion *region = &r_dev->v_addrs[region_num];
> > > > > >      PCIRegion *real_region =
> > > > > >      &r_dev->real_device.regions[region_num]; int ret = 0;
> > > > > > 
> > > > > > +#ifdef KVM_CAP_DEVICE_MSIX_MASK
> > > > > > +    struct kvm_assigned_msix_mmio msix_mmio;
> > > > > > +#endif
> > > > > > 
> > > > > >      DEBUG("e_phys=%08" FMT_PCIBUS " r_virt=%p type=%d len=%08"
> > > > > >      FMT_PCIBUS " region_num=%d \n",
> > > > > >      
> > > > > >            e_phys, region->u.r_virtbase, type, e_size,
> > > > > >            region_num);
> > > > > > 
> > > > > > @@ -282,6 +290,16 @@ static void assigned_dev_iomem_map(PCIDevice
> > > > > > *pci_dev, int region_num,
> > > > > > 
> > > > > >              cpu_register_physical_memory(e_phys + offset,
> > > > > >              
> > > > > >                      TARGET_PAGE_SIZE, r_dev->mmio_index);
> > > > > > 
> > > > > > +#ifdef KVM_CAP_DEVICE_MSIX_MASK
> > > > > > +	    memset(&msix_mmio, 0, sizeof(struct
> > > > > > kvm_assigned_msix_mmio)); +	    msix_mmio.assigned_dev_id =
> > > > > > calc_assigned_dev_id(r_dev->h_segnr, +			    r_dev-
>h_busnr,
> > > > > > r_dev->h_devfn);
> > > > > > +	    msix_mmio.base_addr = e_phys + offset;
> > > > > > +            /* We required kernel MSI-X support */
> > > > > > +	    ret = kvm_assign_reg_msix_mmio(kvm_context, &msix_mmio);
> > > > > > +	    if (ret)
> > > > > > +                fprintf(stderr, "fail to register in-kernel
> > > > > > msix_mmio!\n"); +#endif
> > > > > > 
> > > > > >          }
> > > > > >      
> > > > > >      }
> > > > > > 
> > > > > > @@ -824,11 +842,6 @@ static void
> > > > > > free_assigned_device(AssignedDevice *dev)
> > > > > > 
> > > > > >      }
> > > > > >  
> > > > > >  }
> > > > > > 
> > > > > > -static uint32_t calc_assigned_dev_id(uint16_t seg, uint8_t bus,
> > > > > > uint8_t devfn) -{
> > > > > > -    return (uint32_t)seg << 16 | (uint32_t)bus << 8 |
> > > > > > (uint32_t)devfn; -}
> > > > > > -
> > > > > > 
> > > > > >  static void assign_failed_examine(AssignedDevice *dev)
> > > > > >  {
> > > > > >  
> > > > > >      char name[PATH_MAX], dir[PATH_MAX], driver[PATH_MAX] = {},
> > > > > >      *ns;
> > > > > > 
> > > > > > @@ -1123,6 +1136,27 @@ static int
> > > > > > get_msix_entries_max_nr(AssignedDevice *adev)
> > > > > > 
> > > > > >      return entries_max_nr;
> > > > > >  
> > > > > >  }
> > > > > > 
> > > > > > +#ifdef KVM_CAP_DEVICE_MSIX_MASK
> > > > > > +static int assigned_dev_msix_entry_masked(AssignedDevice *adev,
> > > > > > int entry) +{
> > > > > > +    struct kvm_assigned_msix_entry msix_entry;
> > > > > > +    int r;
> > > > > > +
> > > > > > +    memset(&msix_entry, 0, sizeof msix_entry);
> > > > > > +    msix_entry.assigned_dev_id =
> > > > > > calc_assigned_dev_id(adev->h_segnr, +            adev->h_busnr,
> > > > > > (uint8_t)adev->h_devfn);
> > > > > > +    msix_entry.entry = entry;
> > > > > > +    msix_entry.flags = KVM_MSIX_FLAG_QUERY_MASK;
> > > > > > +    r = kvm_assign_get_msix_entry(kvm_context, &msix_entry);
> > > > > > +    if (r) {
> > > > > > +        fprintf(stderr, "assigned_dev_msix_entry_masked: "
> > > > > > +			"Fail to get mask bit of entry %d\n", entry);
> > > > > > +        return 1;
> > > > > 
> > > > > This error handling seems pretty useless. assert?
> > > > 
> > > > Maybe we can continue with it. Assert seems a little strict.
> > > 
> > > Well need to consider whether to return 1 or 0 then.
> > 
> > Tell it it's masked then routing change may be continue.
> 
> OK, then
> 1. we probably want to print this only once, not on each write
> 2. only try doing this if capability is present.
> 
> > > > > > +    }
> > > > > > +    return (msix_entry.flags & KVM_MSIX_FLAG_MASK);
> > > > > > +}
> > > > > > +#endif
> > > > > > +
> > > > > > 
> > > > > >  static int get_msix_valid_entries_nr(AssignedDevice *adev,
> > > > > >  
> > > > > >  				     uint16_t entries_max_nr)
> > > > > >  
> > > > > >  {
> > > > > > 
> > > > > > @@ -1136,7 +1170,11 @@ static int
> > > > > > get_msix_valid_entries_nr(AssignedDevice *adev,
> > > > > > 
> > > > > >          memcpy(&msg_ctrl, va + i * 16 + 12, 4);
> > > > > >          memcpy(&msg_data, va + i * 16 + 8, 4);
> > > > > >          /* Ignore unused entry even it's unmasked */
> > > > > > 
> > > > > > +#ifdef KVM_CAP_DEVICE_MSIX_MASK
> > > > > > +        if (assigned_dev_msix_entry_masked(adev, i))
> > > > > > +#else
> > > > > > 
> > > > > >          if (msg_data == 0)
> > > > > > 
> > > > > > +#endif
> > > > > 
> > > > > So, we are replacing msg_data == 0 check with masked check?
> > > > > If yes why not do this for non-KVM_CAP_DEVICE_MSIX_MASK too?
> > > > 
> > > > Because the old one doesn't have idea about mask bit...
> > > 
> > > So track mask bits in userspace.
> > 
> > That's not the target of this patch. And it still won't work.
> 
> Sure it will, at least to figure out # of entries.

OK, we can only tracking them.

> > > > > >              continue;
> > > > > >          
> > > > > >          entries_nr ++;
> > > > > >      
> > > > > >      }
> > > > > > 
> > > > > > @@ -1165,6 +1203,8 @@ static int
> > > > > > assigned_dev_update_msix_mmio(PCIDevice *pci_dev,
> > > > > > 
> > > > > >      }
> > > > > >      
> > > > > >      free_dev_irq_entries(adev);
> > > > > > 
> > > > > > +    memset(pci_dev->msix_entry_used, 0, KVM_MAX_MSIX_PER_DEV *
> > > > > > +
> > > > > > sizeof(*pci_dev->msix_entry_used));
> > > > > > 
> > > > > >      adev->irq_entries_nr = entries_nr;
> > > > > >      adev->entry = calloc(entries_nr, sizeof(struct
> > > > > >      kvm_irq_routing_entry)); if (!adev->entry) {
> > > > > > 
> > > > > > @@ -1179,7 +1219,11 @@ static int
> > > > > > assigned_dev_update_msix_mmio(PCIDevice *pci_dev,
> > > > > > 
> > > > > >              break;
> > > > > >          
> > > > > >          memcpy(&msg_ctrl, va + i * 16 + 12, 4);
> > > > > >          memcpy(&msg_data, va + i * 16 + 8, 4);
> > > > > > 
> > > > > > +#ifdef KVM_CAP_DEVICE_MSIX_MASK
> > > > > > +        if (assigned_dev_msix_entry_masked(adev, i))
> > > > > > +#else
> > > > > > 
> > > > > >          if (msg_data == 0)
> > > > > > 
> > > > > > +#endif
> > > > > 
> > > > > You can't use ifdef to check that kernel supports an ioctl.
> > > > > You must check this at runtime.
> > > > 
> > > > Maybe we can convert them in bulk later.
> > > 
> > > Well you are adding a bug, all writes on old kernels
> > > will spew out a ton of stderr.
> > > We can't just assume new kernels.
> > > 
> > > > > >              continue;
> > > > > >          
> > > > > >          memcpy(&msg_addr, va + i * 16, 4);
> > > > > > 
> > > > > > @@ -1200,6 +1244,7 @@ static int
> > > > > > assigned_dev_update_msix_mmio(PCIDevice *pci_dev,
> > > > > > 
> > > > > >          msix_entry.gsi = adev->entry[entries_nr].gsi;
> > > > > >          msix_entry.entry = i;
> > > > > > 
> > > > > > +        pci_dev->msix_entry_used[i] = 1;
> > > > > > 
> > > > > >          r = kvm_assign_set_msix_entry(kvm_context, &msix_entry);
> > > > > >          if (r) {
> > > > > >          
> > > > > >              fprintf(stderr, "fail to set MSI-X entry! %s\n",
> > > > > >              strerror(-r));
> > > > > > 
> > > > > > @@ -1243,6 +1288,8 @@ static void
> > > > > > assigned_dev_update_msix(PCIDevice *pci_dev, int enable_msix)
> > > > > > 
> > > > > >              perror("assigned_dev_update_msix: deassign irq");
> > > > > >          
> > > > > >          assigned_dev->irq_requested_type = 0;
> > > > > > 
> > > > > > +        memset(pci_dev->msix_entry_used, 0, KVM_MAX_MSIX_PER_DEV
> > > > > > * +
> > > > > > sizeof(*pci_dev->msix_entry_used));
> > > > > > 
> > > > > >      }
> > > > > >      
> > > > > >      entries_max_nr = get_msix_entries_max_nr(assigned_dev);
> > > > > > 
> > > > > > @@ -1250,10 +1297,16 @@ static void
> > > > > > assigned_dev_update_msix(PCIDevice *pci_dev, int enable_msix)
> > > > > > 
> > > > > >          fprintf(stderr, "assigned_dev_update_msix: MSI-X
> > > > > >          entries_max_nr == 0"); return;
> > > > > >      
> > > > > >      }
> > > > > > 
> > > > > > +    /*
> > > > > > +     * Guest may try to enable MSI-X before setting MSI-X entry
> > > > > > done, so +     * let's wait until guest unmask the entries.
> > > > > > +     */
> > > > > 
> > > > > Well it can also set up any number of entries, enable msix then
> > > > > set up more entries. Now what?
> > > > 
> > > > It's the same. If it want to set up more entries, it have to unmask
> > > > them. Then we would get them.
> > > 
> > > Why can't we handle the first enable in the same way?
> > 
> > Don't understand the question, but I guess the answer is
> > pci_enable_msix().
> 
> pci_enable_msix is an internal kernel API to enable msix, isn't it?
> We are discussing qemu patch here.

OK, I understand the question now. The others entries would be enabled with 
unmask, but not very first ones. Guest can unmask the entries first, then enable 
MSI-X. We should scan the table when MSI-X enabled, that's why the first one is 
different from others.

> > > > > >      entries_nr = get_msix_valid_entries_nr(assigned_dev,
> > > > > >      entries_max_nr); if (entries_nr == 0) {
> > > > > > 
> > > > > > +#ifndef KVM_CAP_DEVICE_MSIX_MASK
> > > > > > 
> > > > > >          if (enable_msix)
> > > > > >          
> > > > > >              fprintf(stderr, "MSI-X entry number is zero!\n");
> > > > > 
> > > > > And what happens then?
> > > > 
> > > > MSI-X can't work for old ones without MSIX mask support.
> > > 
> > > Old ones?
> > 
> > Old userspace without KVM_CAP_DEVICE_MSIX_MASK
> > 
> > > > Reload the guest module
> > > > may help.
> > > 
> > > Guest might not have any concept of modules.
> > 
> > Just an workaround. Not a suggestion method.
> 
> Look, if there's some failure mode we need a way to recover,
> or to make it very unlikely. If this is guest doing something
> illegal let's add a comment explaining what it is and why
> it's illegal. If this is legal, why the print out?

It's unsupported in the old QEmu. I can change the comment, but I do think it's 
good to let user know something important is wrong. 

Maybe it's better to put it into DEBUG.

--
regards
Yang, Sheng

> 
> > --
> > regards
> > Yang, Sheng
> > 
> > > > --
> > > > regards
> > > > Yang, Sheng
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html