On Thu, 3 Aug 2023 10:41:09 -0400 Stefan Hajnoczi <stefanha@xxxxxxxxxx> wrote: > The VFIO_DEVICE_GET_INFO, VFIO_DEVICE_GET_REGION_INFO, and > VFIO_IOMMU_GET_INFO ioctls fill in an info struct followed by capability > structs: > > +------+---------+---------+-----+ > | info | caps[0] | caps[1] | ... | > +------+---------+---------+-----+ > > Both the info and capability struct sizes are not always multiples of > sizeof(u64), leaving u64 fields in later capability structs misaligned. > > Userspace applications currently need to handle misalignment manually in > order to support CPU architectures and programming languages with strict > alignment requirements. > > Make life easier for userspace by ensuring alignment in the kernel. > The new layout is as follows: > > +------+---+---------+---------+---+-----+ > | info | 0 | caps[0] | caps[1] | 0 | ... | > +------+---+---------+---------+---+-----+ > > In this example info and caps[1] have sizes that are not multiples of > sizeof(u64), so zero padding is added to align the subsequent structure. > > Adding zero padding between structs does not break the uapi. The memory > layout is specified by the info.cap_offset and caps[i].next fields > filled in by the kernel. Applications use these field values to locate > structs and are therefore unaffected by the addition of zero padding. 
> > Signed-off-by: Stefan Hajnoczi <stefanha@xxxxxxxxxx> > --- > include/linux/vfio.h | 2 +- > drivers/gpu/drm/i915/gvt/kvmgt.c | 7 +++-- > drivers/s390/cio/vfio_ccw_ops.c | 7 +++-- > drivers/vfio/pci/vfio_pci_core.c | 14 ++++++--- > drivers/vfio/vfio_iommu_type1.c | 7 +++-- > drivers/vfio/vfio_main.c | 53 +++++++++++++++++++++++++++----- > 6 files changed, 71 insertions(+), 19 deletions(-) > > diff --git a/include/linux/vfio.h b/include/linux/vfio.h > index 2c137ea94a3e..ff0864e73cc3 100644 > --- a/include/linux/vfio.h > +++ b/include/linux/vfio.h > @@ -272,7 +272,7 @@ struct vfio_info_cap { > struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps, > size_t size, u16 id, > u16 version); > -void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset); > +ssize_t vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset); > > int vfio_info_add_capability(struct vfio_info_cap *caps, > struct vfio_info_cap_header *cap, size_t size); > diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c > index de675d799c7d..9060e9c6ac7c 100644 > --- a/drivers/gpu/drm/i915/gvt/kvmgt.c > +++ b/drivers/gpu/drm/i915/gvt/kvmgt.c > @@ -1297,7 +1297,10 @@ static long intel_vgpu_ioctl(struct vfio_device *vfio_dev, unsigned int cmd, > info.argsz = sizeof(info) + caps.size; > info.cap_offset = 0; > } else { > - vfio_info_cap_shift(&caps, sizeof(info)); > + ssize_t cap_offset = vfio_info_cap_shift(&caps, sizeof(info)); > + if (cap_offset < 0) > + return cap_offset; > + > if (copy_to_user((void __user *)arg + > sizeof(info), caps.buf, > caps.size)) { > @@ -1305,7 +1308,7 @@ static long intel_vgpu_ioctl(struct vfio_device *vfio_dev, unsigned int cmd, > kfree(sparse); > return -EFAULT; > } > - info.cap_offset = sizeof(info); > + info.cap_offset = cap_offset; The copy_to_user() above needs to be modified to make this true: copy_to_user((void __user *)arg + cap_offset,... Same for all similar below. 
> } > > kfree(caps.buf); > diff --git a/drivers/s390/cio/vfio_ccw_ops.c b/drivers/s390/cio/vfio_ccw_ops.c > index 5b53b94f13c7..63d5163376a5 100644 > --- a/drivers/s390/cio/vfio_ccw_ops.c > +++ b/drivers/s390/cio/vfio_ccw_ops.c > @@ -361,13 +361,16 @@ static int vfio_ccw_mdev_get_region_info(struct vfio_ccw_private *private, > info->argsz = sizeof(*info) + caps.size; > info->cap_offset = 0; > } else { > - vfio_info_cap_shift(&caps, sizeof(*info)); > + ssize_t cap_offset = vfio_info_cap_shift(&caps, sizeof(*info)); > + if (cap_offset < 0) > + return cap_offset; > + > if (copy_to_user((void __user *)arg + sizeof(*info), > caps.buf, caps.size)) { > kfree(caps.buf); > return -EFAULT; > } > - info->cap_offset = sizeof(*info); > + info->cap_offset = cap_offset; > } > > kfree(caps.buf); > diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c > index 20d7b69ea6ff..92c093b99187 100644 > --- a/drivers/vfio/pci/vfio_pci_core.c > +++ b/drivers/vfio/pci/vfio_pci_core.c > @@ -966,12 +966,15 @@ static int vfio_pci_ioctl_get_info(struct vfio_pci_core_device *vdev, > if (info.argsz < sizeof(info) + caps.size) { > info.argsz = sizeof(info) + caps.size; > } else { > - vfio_info_cap_shift(&caps, sizeof(info)); > + ssize_t cap_offset = vfio_info_cap_shift(&caps, sizeof(info)); > + if (cap_offset < 0) > + return cap_offset; > + > if (copy_to_user(arg + 1, caps.buf, caps.size)) { > kfree(caps.buf); > return -EFAULT; > } > - info.cap_offset = sizeof(*arg); > + info.cap_offset = cap_offset; > } > > kfree(caps.buf); > @@ -1107,12 +1110,15 @@ static int vfio_pci_ioctl_get_region_info(struct vfio_pci_core_device *vdev, > info.argsz = sizeof(info) + caps.size; > info.cap_offset = 0; > } else { > - vfio_info_cap_shift(&caps, sizeof(info)); > + ssize_t cap_offset = vfio_info_cap_shift(&caps, sizeof(info)); > + if (cap_offset < 0) > + return cap_offset; > + > if (copy_to_user(arg + 1, caps.buf, caps.size)) { > kfree(caps.buf); > return -EFAULT; > } > - info.cap_offset = 
sizeof(*arg); > + info.cap_offset = cap_offset; > } > > kfree(caps.buf); > diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c > index ebe0ad31d0b0..ab64b9e3ed7c 100644 > --- a/drivers/vfio/vfio_iommu_type1.c > +++ b/drivers/vfio/vfio_iommu_type1.c > @@ -2808,14 +2808,17 @@ static int vfio_iommu_type1_get_info(struct vfio_iommu *iommu, > if (info.argsz < sizeof(info) + caps.size) { > info.argsz = sizeof(info) + caps.size; > } else { > - vfio_info_cap_shift(&caps, sizeof(info)); > + ssize_t cap_offset = vfio_info_cap_shift(&caps, sizeof(info)); > + if (cap_offset < 0) > + return cap_offset; > + > if (copy_to_user((void __user *)arg + > sizeof(info), caps.buf, > caps.size)) { > kfree(caps.buf); > return -EFAULT; > } > - info.cap_offset = sizeof(info); > + info.cap_offset = cap_offset; > } > > kfree(caps.buf); > diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c > index f0ca33b2e1df..4fc8698577a7 100644 > --- a/drivers/vfio/vfio_main.c > +++ b/drivers/vfio/vfio_main.c > @@ -1171,8 +1171,18 @@ struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps, > { > void *buf; > struct vfio_info_cap_header *header, *tmp; > + size_t header_offset; > + size_t new_size; > > - buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL); > + /* > + * Reserve extra space when the previous capability was not a multiple of > + * the largest field size. This ensures that capabilities are properly > + * aligned. > + */ If we simply start with: size = ALIGN(size, sizeof(u64)); then shouldn't there never be a previous misaligned size to correct? I wonder if we really need all this complexity. We're drawing from a finite set of info structs for the initial alignment, we can pad those without breaking the uapi, and we can introduce a warning to avoid such poor alignment in the future. Allocating an aligned size for each capability is then sufficiently trivial to handle at runtime. 
ex: diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 902f06e52c48..2d074cbd371d 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -1362,6 +1362,8 @@ struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps, void *buf; struct vfio_info_cap_header *header, *tmp; + size = ALIGN(size, sizeof(u64)); + buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL); if (!buf) { kfree(caps->buf); @@ -1395,6 +1397,8 @@ void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset) struct vfio_info_cap_header *tmp; void *buf = (void *)caps->buf; + WARN_ON(!IS_ALIGNED(offset, sizeof(u64))); + for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset) tmp->next += offset; } diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index fa06e3eb4955..fd2761841ffe 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -217,6 +217,7 @@ struct vfio_device_info { __u32 num_regions; /* Max region index + 1 */ __u32 num_irqs; /* Max IRQ index + 1 */ __u32 cap_offset; /* Offset within info struct of first cap */ + __u32 pad; /* Size must be aligned for caps */ }; #define VFIO_DEVICE_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 7) @@ -1444,6 +1445,7 @@ struct vfio_iommu_type1_info { #define VFIO_IOMMU_INFO_CAPS (1 << 1) /* Info supports caps */ __u64 iova_pgsizes; /* Bitmap of supported page sizes */ __u32 cap_offset; /* Offset within info struct of first cap */ + __u32 pad; /* Size must be aligned for caps */ }; /* Thanks, Alex > + header_offset = ALIGN(caps->size, sizeof(u64)); > + new_size = header_offset + size; > + > + buf = krealloc(caps->buf, new_size, GFP_KERNEL); > if (!buf) { > kfree(caps->buf); > caps->buf = NULL; > @@ -1181,10 +1191,10 @@ struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps, > } > > caps->buf = buf; > - header = buf + caps->size; > + header = buf + header_offset; > > /* Eventually copied to user buffer, zero */ > - memset(header, 0, size); > + memset(buf + 
caps->size, 0, new_size - caps->size); > > header->id = id; > header->version = version; > @@ -1193,20 +1203,47 @@ struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps, > for (tmp = buf; tmp->next; tmp = buf + tmp->next) > ; /* nothing */ > > - tmp->next = caps->size; > - caps->size += size; > + tmp->next = header_offset; > + caps->size = new_size; > > return header; > } > EXPORT_SYMBOL_GPL(vfio_info_cap_add); > > -void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset) > +/* > + * Adjust the capability next fields to account for the given offset at which > + * capability structures start and any padding added for alignment. Returns the > + * cap_offset or -errno. > + */ > +ssize_t vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset) > { > struct vfio_info_cap_header *tmp; > + struct vfio_info_cap_header *next_tmp; > void *buf = (void *)caps->buf; > + size_t pad = ALIGN(offset, sizeof(u64)) - offset; > + size_t cap_offset = offset + pad; > > - for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset) > - tmp->next += offset; > + /* Shift the next fields to account for offset and pad */ > + for (tmp = buf; tmp->next; tmp = next_tmp) { > + next_tmp = buf + tmp->next; > + tmp->next += cap_offset; > + } > + > + /* Pad with zeroes so capabilities start with proper alignment */ > + buf = krealloc(caps->buf, caps->size + pad, GFP_KERNEL); > + if (!buf) { > + kfree(caps->buf); > + caps->buf = NULL; > + caps->size = 0; > + return -ENOMEM; > + } > + > + memmove(buf + pad, buf, caps->size); > + memset(buf, 0, pad); > + > + caps->buf = buf; > + caps->size += pad; > + return cap_offset; > } > EXPORT_SYMBOL(vfio_info_cap_shift); >