On Sun, 22 Mar 2020 05:33:14 -0700 "Liu, Yi L" <yi.l.liu@xxxxxxxxx> wrote: > From: Liu Yi L <yi.l.liu@xxxxxxxxx> > > Per PCIe r5.0, sec 9.3.7.14, if a PF implements the PASID Capability, the > PF PASID configuration is shared by its VFs. VFs must not implement their > own PASID Capability. > > Per PCIe r5.0, sec 9.3.7.11, VFs must not implement the PRI Capability. If > the PF implements PRI, it is shared by the VFs. > > On bare metal, it has been fixed by below efforts. > to PASID/PRI are > https://lkml.org/lkml/2019/9/5/996 > https://lkml.org/lkml/2019/9/5/995 > > This patch adds emulated PASID/PRI capabilities for VFs when assigned to > VMs via vfio-pci driver. This is required for enabling vSVA on pass-through > VFs as VFs have no PASID/PRI capability structure in its configure space. > > Cc: Kevin Tian <kevin.tian@xxxxxxxxx> > CC: Jacob Pan <jacob.jun.pan@xxxxxxxxxxxxxxx> > Cc: Alex Williamson <alex.williamson@xxxxxxxxxx> > Cc: Eric Auger <eric.auger@xxxxxxxxxx> > Cc: Jean-Philippe Brucker <jean-philippe@xxxxxxxxxx> > Signed-off-by: Liu Yi L <yi.l.liu@xxxxxxxxx> > --- > drivers/vfio/pci/vfio_pci_config.c | 325 ++++++++++++++++++++++++++++++++++++- > 1 file changed, 323 insertions(+), 2 deletions(-) > > diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c > index 4b9af99..84b4ea0 100644 > --- a/drivers/vfio/pci/vfio_pci_config.c > +++ b/drivers/vfio/pci/vfio_pci_config.c > @@ -1509,11 +1509,304 @@ static int vfio_cap_init(struct vfio_pci_device *vdev) > return 0; > } > > +static int vfio_fill_custom_vconfig_bytes(struct vfio_pci_device *vdev, > + int offset, uint8_t *data, int size) > +{ > + int ret = 0, data_offset = 0; > + > + while (size) { > + int filled; > + > + if (size >= 4 && !(offset % 4)) { > + __le32 *dwordp = (__le32 *)&vdev->vconfig[offset]; > + u32 dword; > + > + memcpy(&dword, data + data_offset, 4); > + *dwordp = cpu_to_le32(dword); Why wouldn't we do: *dwordp = cpu_to_le32(*(u32 *)(data + data_offset)); or better yet, increment data on each iteration for: *dwordp = cpu_to_le32(*(u32 *)data); vfio_fill_vconfig_bytes() does almost this same thing, getting the data from config space rather than a buffer, so please figure out how to avoid duplicating the logic. > + filled = 4; > + } else if (size >= 2 && !(offset % 2)) { > + __le16 *wordp = (__le16 *)&vdev->vconfig[offset]; > + u16 word; > + > + memcpy(&word, data + data_offset, 2); > + *wordp = cpu_to_le16(word); > + filled = 2; > + } else { > + u8 *byte = &vdev->vconfig[offset]; > + > + memcpy(byte, data + data_offset, 1); This one is really silly. vdev->vconfig[offset] = *data; > + filled = 1; > + } > + > + offset += filled; > + data_offset += filled; > + size -= filled; > + } > + > + return ret; > +} > + > +static int vfio_pci_get_ecap_content(struct pci_dev *pdev, > + int cap, int cap_len, u8 *content) > +{ > + int pos, offset, len = cap_len, ret = 0; > + > + pos = pci_find_ext_capability(pdev, cap); > + if (!pos) > + return -EINVAL; > + > + offset = 0; > + while (len) { > + int fetched; > + > + if (len >= 4 && !(pos % 4)) { > + u32 *dwordp = (u32 *) (content + offset); > + u32 dword; > + __le32 *dwptr = (__le32 *) &dword; > + > + ret = pci_read_config_dword(pdev, pos, &dword); > + if (ret) > + return ret; > + *dwordp = le32_to_cpu(*dwptr); WHAT??? pci_read_config_dword() returns cpu endian data! > + fetched = 4; > + } else if (len >= 2 && !(pos % 2)) { > + u16 *wordp = (u16 *) (content + offset); > + u16 word; > + __le16 *wptr = (__le16 *) &word; > + > + ret = pci_read_config_word(pdev, pos, &word); > + if (ret) > + return ret; > + *wordp = le16_to_cpu(*wptr); > + fetched = 2; > + } else { > + u8 *byte = (u8 *) (content + offset); > + > + ret = pci_read_config_byte(pdev, pos, byte); > + if (ret) > + return ret; > + fetched = 1; > + } > + > + pos += fetched; > + offset += fetched; > + len -= fetched; > + } > + > + return ret; > +} > + > +struct vfio_pci_pasid_cap_data { > + u32 id:16; > + u32 version:4; > + u32 next:12; > + union { > + u16 cap_reg_val; > + struct { > + u16 rsv1:1; > + u16 execs:1; > + u16 prvs:1; > + u16 rsv2:5; > + u16 pasid_bits:5; > + u16 rsv3:3; > + }; > + } cap_reg; > + union { > + u16 control_reg_val; > + struct { > + u16 paside:1; > + u16 exece:1; > + u16 prve:1; > + u16 rsv4:13; > + }; > + } control_reg; > +}; > + > +static int vfio_pci_add_pasid_cap(struct vfio_pci_device *vdev, > + struct pci_dev *pdev, > + u16 epos, u16 *next, __le32 **prevp) > +{ > + u8 *map = vdev->pci_config_map; > + int ecap = PCI_EXT_CAP_ID_PASID; > + int len = pci_ext_cap_length[ecap]; > + struct vfio_pci_pasid_cap_data pasid_cap; > + struct vfio_pci_pasid_cap_data vpasid_cap; > + int ret; > + > + /* > + * If no cap filled in this function, should make sure the next > + * pointer points to current epos. > + */ > + *next = epos; > + > + if (!len) { > + pr_info("%s: VF %s hiding PASID capability\n", > + __func__, dev_name(&vdev->pdev->dev)); > + ret = 0; > + goto out; > + } No! Why? This is dead code. > + > + /* Add PASID capability*/ > + ret = vfio_pci_get_ecap_content(pdev, ecap, > + len, (u8 *)&pasid_cap); Why wouldn't we just use vfio_fill_config_bytes() rather than this ridiculous approach of coping it to a buffer, modifying it and copying it out? > + if (ret) > + goto out; > + > + if (!pasid_cap.control_reg.paside) { > + pr_debug("%s: its PF's PASID capability is not enabled\n", > + dev_name(&vdev->pdev->dev)); > + ret = 0; > + goto out; > + } What happens if the PF's PASID gets disabled while we're using it?? > + > + memcpy(&vpasid_cap, &pasid_cap, len); > + > + vpasid_cap.id = 0x18; What is going on here? #define PCI_EXT_CAP_ID_LTR 0x18 /* Latency Tolerance Reporting */ > + vpasid_cap.next = 0; > + /* clear the control reg for guest */ > + memset(&vpasid_cap.control_reg, 0x0, > + sizeof(vpasid_cap.control_reg)); So we zero a couple registers and that's why we need a structure to define the entire capability and this crazy copy to a cpu endian buffer. No. > + > + memset(map + epos, vpasid_cap.id, len); See below. > + ret = vfio_fill_custom_vconfig_bytes(vdev, epos, > + (u8 *)&vpasid_cap, len); > + if (!ret) { > + /* > + * Successfully filled in PASID cap, update > + * the next offset in previous cap header, > + * and also update caller about the offset > + * of next cap if any. > + */ > + u32 val = epos; > + **prevp &= cpu_to_le32(~(0xffcU << 20)); > + **prevp |= cpu_to_le32(val << 20); > + *prevp = (__le32 *)&vdev->vconfig[epos]; > + *next = epos + len; Could we make this any more complicated? > + } > + > +out: > + return ret; > +} > + > +struct vfio_pci_pri_cap_data { > + u32 id:16; > + u32 version:4; > + u32 next:12; > + union { > + u16 control_reg_val; > + struct { > + u16 enable:1; > + u16 reset:1; > + u16 rsv1:14; > + }; > + } control_reg; > + union { > + u16 status_reg_val; > + struct { > + u16 rf:1; > + u16 uprgi:1; > + u16 rsv2:6; > + u16 stop:1; > + u16 rsv3:6; > + u16 pasid_required:1; > + }; > + } status_reg; > + u32 prq_capacity; > + u32 prq_quota; > +}; > + > +static int vfio_pci_add_pri_cap(struct vfio_pci_device *vdev, > + struct pci_dev *pdev, > + u16 epos, u16 *next, __le32 **prevp) > +{ > + u8 *map = vdev->pci_config_map; > + int ecap = PCI_EXT_CAP_ID_PRI; > + int len = pci_ext_cap_length[ecap]; > + struct vfio_pci_pri_cap_data pri_cap; > + struct vfio_pci_pri_cap_data vpri_cap; > + int ret; > + > + /* > + * If no cap filled in this function, should make sure the next > + * pointer points to current epos. > + */ > + *next = epos; > + > + if (!len) { > + pr_info("%s: VF %s hiding PRI capability\n", > + __func__, dev_name(&vdev->pdev->dev)); > + ret = 0; > + goto out; > + } > + > + /* Add PASID capability*/ > + ret = vfio_pci_get_ecap_content(pdev, ecap, > + len, (u8 *)&pri_cap); > + if (ret) > + goto out; > + > + if (!pri_cap.control_reg.enable) { > + pr_debug("%s: its PF's PRI capability is not enabled\n", > + dev_name(&vdev->pdev->dev)); > + ret = 0; > + goto out; > + } > + > + memcpy(&vpri_cap, &pri_cap, len); > + > + vpri_cap.id = 0x19; #define PCI_EXT_CAP_ID_SECPCI 0x19 /* Secondary PCIe Capability */ ??? > + vpri_cap.next = 0; > + /* clear the control reg for guest */ > + memset(&vpri_cap.control_reg, 0x0, > + sizeof(vpri_cap.control_reg)); > + > + memset(map + epos, vpri_cap.id, len); > + ret = vfio_fill_custom_vconfig_bytes(vdev, epos, > + (u8 *)&vpri_cap, len); > + if (!ret) { > + /* > + * Successfully filled in PASID cap, update > + * the next offset in previous cap header, > + * and also update caller about the offset > + * of next cap if any. > + */ > + u32 val = epos; > + **prevp &= cpu_to_le32(~(0xffcU << 20)); > + **prevp |= cpu_to_le32(val << 20); > + *prevp = (__le32 *)&vdev->vconfig[epos]; > + *next = epos + len; > + } > + > +out: > + return ret; > +} > + > +static int vfio_pci_add_emulated_cap_for_vf(struct vfio_pci_device *vdev, > + struct pci_dev *pdev, u16 start_epos, __le32 *prev) > +{ > + __le32 *__prev = prev; > + u16 epos = start_epos, epos_next = start_epos; > + int ret = 0; > + > + /* Add PASID capability*/ > + ret = vfio_pci_add_pasid_cap(vdev, pdev, epos, > + &epos_next, &__prev); > + if (ret) > + return ret; > + > + /* Add PRI capability */ > + epos = epos_next; > + ret = vfio_pci_add_pri_cap(vdev, pdev, epos, > + &epos_next, &__prev); > + > + return ret; > +} > + > static int vfio_ecap_init(struct vfio_pci_device *vdev) > { > struct pci_dev *pdev = vdev->pdev; > u8 *map = vdev->pci_config_map; > - u16 epos; > + u16 epos, epos_max; > __le32 *prev = NULL; > int loops, ret, ecaps = 0; > > @@ -1521,6 +1814,7 @@ static int vfio_ecap_init(struct vfio_pci_device *vdev) > return 0; > > epos = PCI_CFG_SPACE_SIZE; > + epos_max = PCI_CFG_SPACE_SIZE; > > loops = (pdev->cfg_size - PCI_CFG_SPACE_SIZE) / PCI_CAP_SIZEOF; > > @@ -1545,6 +1839,9 @@ static int vfio_ecap_init(struct vfio_pci_device *vdev) > } > } > > + if (epos_max <= (epos + len)) > + epos_max = epos + len; > + > if (!len) { > pci_info(pdev, "%s: hiding ecap %#x@%#x\n", > __func__, ecap, epos); > @@ -1604,6 +1901,18 @@ static int vfio_ecap_init(struct vfio_pci_device *vdev) > if (!ecaps) > *(u32 *)&vdev->vconfig[PCI_CFG_SPACE_SIZE] = 0; > > +#ifdef CONFIG_PCI_ATS > + if (pdev->is_virtfn) { > + struct pci_dev *physfn = pdev->physfn; > + > + ret = vfio_pci_add_emulated_cap_for_vf(vdev, > + physfn, epos_max, prev); > + if (ret) > + pr_info("%s, failed to add special caps for VF %s\n", > + __func__, dev_name(&vdev->pdev->dev)); > + } > +#endif I can only imagine that we should place the caps at the same location they exist on the PF, we don't know what hidden registers might be hiding in config space. > + > return 0; > } > > @@ -1748,6 +2057,17 @@ static size_t vfio_pci_cap_remaining_dword(struct vfio_pci_device *vdev, > return i; > } > > +static bool vfio_pci_need_virt_perm(struct pci_dev *pdev, u8 cap_id) > +{ > +#ifdef CONFIG_PCI_ATS > + return (pdev->is_virtfn && > + (cap_id == PCI_EXT_CAP_ID_PASID || > + cap_id == PCI_EXT_CAP_ID_PRI)); > +#else > + return false; > +#endif > +} > + > static ssize_t vfio_config_do_rw(struct vfio_pci_device *vdev, char __user *buf, > size_t count, loff_t *ppos, bool iswrite) > { > @@ -1781,7 +2101,8 @@ static ssize_t vfio_config_do_rw(struct vfio_pci_device *vdev, char __user *buf, > if (cap_id == PCI_CAP_ID_INVALID) { > perm = &unassigned_perms; > cap_start = *ppos; > - } else if (cap_id == PCI_CAP_ID_INVALID_VIRT) { > + } else if (cap_id == PCI_CAP_ID_INVALID_VIRT || > + vfio_pci_need_virt_perm(pdev, cap_id)) { Why not simply fill the map with PCI_CAP_ID_INVALID_VIRT? > perm = &virt_perms; > cap_start = *ppos; > } else { This is really not close to acceptable as is. Sorry. Thanks, Alex