Rusty Russell <rusty@xxxxxxxxxxxxxxx> writes:
> I'll create some patches and see if it's too ugly to live...

Hmm, with rough userspace testing I managed to get the speed penalty
pretty low.  Here are the two patches, inline:

vringh: handle case where data goes across multiple ranges.

QEMU seems to use separate memory ranges only for ROM vs RAM, but in
theory a single scatterlist element referred to by a vring could cross
two ranges, and would thus need to be split into two separate iovec
entries.

This causes no measurable slowdown:

Before: (average of 20 runs)
	./vringh_test --indirect --eventidx --parallel
	real	0m3.236s

After: (average of 20 runs)
	./vringh_test --indirect --eventidx --parallel
	real	0m3.223s

Signed-off-by: Rusty Russell <rusty@xxxxxxxxxxxxxxx>

diff --git a/drivers/vhost/vringh.c b/drivers/vhost/vringh.c
index 2ba087d..50d37a7 100644
--- a/drivers/vhost/vringh.c
+++ b/drivers/vhost/vringh.c
@@ -91,33 +91,37 @@ static inline ssize_t vringh_iov_xfer(struct vringh_kiov *iov,
 	return done;
 }
 
-static inline bool check_range(u64 addr, u32 len,
+/* May reduce *len if range is shorter. */
+static inline bool check_range(u64 addr, u32 *len,
 			       struct vringh_range *range,
 			       bool (*getrange)(u64, struct vringh_range *))
 {
 	if (addr < range->start || addr > range->end_incl) {
 		if (!getrange(addr, range))
-			goto bad;
+			return false;
 	}
 	BUG_ON(addr < range->start || addr > range->end_incl);
 
 	/* To end of memory? */
-	if (unlikely(addr + len == 0)) {
+	if (unlikely(addr + *len == 0)) {
 		if (range->end_incl == -1ULL)
 			return true;
-		goto bad;
+		goto truncate;
 	}
 
 	/* Otherwise, don't wrap. */
-	if (unlikely(addr + len < addr))
-		goto bad;
-	if (unlikely(addr + len - 1 > range->end_incl))
-		goto bad;
+	if (addr + *len < addr) {
+		vringh_bad("Wrapping descriptor %u@0x%llx", *len, addr);
+		return false;
+	}
+
+	if (unlikely(addr + *len - 1 > range->end_incl))
+		goto truncate;
 
 	return true;
-bad:
-	vringh_bad("Malformed descriptor address %u@0x%llx", len, addr);
-	return false;
+truncate:
+	*len = range->end_incl + 1 - addr;
+	return true;
 }
 
 /* No reason for this code to be inline. */
@@ -205,19 +209,30 @@ __vringh_iov(struct vringh *vrh, u16 i,
 	for (;;) {
 		void *addr;
 		struct vringh_kiov *iov;
+		u32 len;
 
 		err = getdesc(&desc, &descs[i]);
 		if (unlikely(err))
 			goto fail;
 
-		/* Make sure it's OK, and get offset. */
-		if (!check_range(desc.addr, desc.len, &range, getrange)) {
-			err = -EINVAL;
-			goto fail;
-		}
-		addr = (void *)(long)desc.addr + range.offset;
-
 		if (unlikely(desc.flags & VRING_DESC_F_INDIRECT)) {
+			/* Make sure it's OK, and get offset. */
+			len = desc.len;
+			if (!check_range(desc.addr, &len, &range, getrange)) {
+				err = -EINVAL;
+				goto fail;
+			}
+
+			if (len != desc.len) {
+				vringh_bad("Indirect descriptor table across"
+					   " ranges: %u@%#llx vs %#llx-%#llx",
+					   desc.len, desc.addr, range.start,
+					   range.end_incl);
+				err = -EINVAL;
+				goto fail;
+			}
+			addr = (void *)(long)(desc.addr + range.offset);
+
 			err = move_to_indirect(&up_next, &i, addr, &desc,
 					       &descs, &desc_max);
 			if (err)
@@ -243,6 +258,15 @@ __vringh_iov(struct vringh *vrh, u16 i,
 			}
 		}
 
+	again:
+		/* Make sure it's OK, and get offset. */
+		len = desc.len;
+		if (!check_range(desc.addr, &len, &range, getrange)) {
+			err = -EINVAL;
+			goto fail;
+		}
+		addr = (void *)(unsigned long)(desc.addr + range.offset);
+
 		if (unlikely(iov->i == iov->max)) {
 			err = resize_iovec(iov, gfp);
 			if (err)
@@ -250,9 +274,15 @@ __vringh_iov(struct vringh *vrh, u16 i,
 		}
 
 		iov->iov[iov->i].iov_base = addr;
-		iov->iov[iov->i].iov_len = desc.len;
+		iov->iov[iov->i].iov_len = len;
 		iov->i++;
 
+		if (unlikely(len != desc.len)) {
+			desc.len -= len;
+			desc.addr += len;
+			goto again;
+		}
+
 		if (desc.flags & VRING_DESC_F_NEXT) {
 			i = desc.next;
 		} else {
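
To see what the new "goto again" path in __vringh_iov() buys us, here is a
minimal standalone userspace model of the same truncate-and-resume idea.
It is not the kernel code above: the two-range layout, the struct names and
the fixed iov array are invented for illustration, and ranges are found by
walking a static table rather than via a getrange() callback.

/*
 * Standalone sketch: how one descriptor crossing a range boundary
 * becomes two iovec entries (the split the patch performs).
 */
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct range { uint64_t start, end_incl; };
struct iovec_ { uint64_t base; uint32_t len; };

/* Like the patched check_range(): may shrink *len to fit the range. */
static bool check_range(uint64_t addr, uint32_t *len, const struct range *r)
{
	if (addr < r->start || addr > r->end_incl)
		return false;
	if (addr + *len - 1 > r->end_incl)
		*len = r->end_incl + 1 - addr;	/* truncated; caller resumes */
	return true;
}

int main(void)
{
	/* Two adjacent ranges, split at 0x1000 (purely illustrative). */
	struct range ranges[] = { { 0x0, 0xfff }, { 0x1000, 0x1fff } };
	struct iovec_ iov[8];
	int i = 0;

	/* A 0x200-byte descriptor straddling the 0x1000 boundary. */
	uint64_t addr = 0xf00;
	uint32_t len = 0x200;

again:	/* mirrors the "goto again" added to __vringh_iov() */
	for (int r = 0; r < 2; r++) {
		uint32_t part = len;

		if (!check_range(addr, &part, &ranges[r]))
			continue;
		iov[i].base = addr;
		iov[i].len = part;
		i++;
		if (part != len) {	/* descriptor continues in next range */
			addr += part;
			len -= part;
			goto again;
		}
		len = 0;
		break;
	}

	assert(len == 0);	/* every byte landed in some range */
	assert(i == 2);		/* the descriptor was split in two */
	printf("iov[0]=%#llx+%u iov[1]=%#llx+%u\n",
	       (unsigned long long)iov[0].base, iov[0].len,
	       (unsigned long long)iov[1].base, iov[1].len);
	return 0;
}

The 0x200-byte descriptor comes out as two entries of 0x100 bytes each, one
per range, which is exactly what the new iov-filling loop does.
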
vringh: handle case where indirect descriptors go across multiple ranges.

If a data segment can go across multiple ranges, so can an indirect
descriptor table.  This is nastier, so we explicitly slowpath this.

This slows things down a little, though.

Before: (average of 20 runs)
	./vringh_test --indirect --eventidx --parallel
	real	0m3.217s

After: (average of 20 runs)
	./vringh_test --indirect --eventidx --parallel
	real	0m3.364s

Signed-off-by: Rusty Russell <rusty@xxxxxxxxxxxxxxx>

diff --git a/drivers/vhost/vringh.c b/drivers/vhost/vringh.c
index 50d37a7..5dbf4b1 100644
--- a/drivers/vhost/vringh.c
+++ b/drivers/vhost/vringh.c
@@ -188,17 +188,46 @@ static u16 __cold return_from_indirect(const struct vringh *vrh, int *up_next,
 	return i;
 }
 
+static int slow_copy(void *dst, const void *src,
+		     bool (*getrange)(u64 addr, struct vringh_range *r),
+		     struct vringh_range *range,
+		     int (*copy)(void *dst, const void *src, size_t len))
+{
+	size_t part, len = sizeof(struct vring_desc);
+
+	do {
+		u64 addr;
+		int err;
+
+		part = len;
+		addr = (u64)(unsigned long)src - range->offset;
+
+		if (!check_range(addr, &part, range, getrange))
+			return -EINVAL;
+
+		err = copy(dst, src, part);
+		if (err)
+			return err;
+
+		dst += part;
+		src += part;
+		len -= part;
+	} while (len);
+	return 0;
+}
+
 static inline int
 __vringh_iov(struct vringh *vrh, u16 i,
 	     struct vringh_kiov *riov,
 	     struct vringh_kiov *wiov,
 	     bool (*getrange)(u64 addr, struct vringh_range *r),
 	     gfp_t gfp,
-	     int (*getdesc)(struct vring_desc *dst, const struct vring_desc *s))
+	     int (*copy)(void *dst, const void *src, size_t len))
 {
 	int err, count = 0, up_next, desc_max;
 	struct vring_desc desc, *descs;
-	struct vringh_range range = { -1ULL, 0 };
+	struct vringh_range range = { -1ULL, 0 }, slowrange;
+	bool slow = false;
 
 	/* We start traversing vring's descriptor table. */
 	descs = vrh->vring.desc;
@@ -211,7 +240,11 @@ __vringh_iov(struct vringh *vrh, u16 i,
 		struct vringh_kiov *iov;
 		u32 len;
 
-		err = getdesc(&desc, &descs[i]);
+		if (unlikely(slow))
+			err = slow_copy(&desc, &descs[i], getrange, &slowrange,
+					copy);
+		else
+			err = copy(&desc, &descs[i], sizeof(desc));
 		if (unlikely(err))
 			goto fail;
 
@@ -223,16 +256,13 @@ __vringh_iov(struct vringh *vrh, u16 i,
 				goto fail;
 			}
 
-			if (len != desc.len) {
-				vringh_bad("Indirect descriptor table across"
-					   " ranges: %u@%#llx vs %#llx-%#llx",
-					   desc.len, desc.addr, range.start,
-					   range.end_incl);
-				err = -EINVAL;
-				goto fail;
+			if (unlikely(len != desc.len)) {
+				slow = true;
+				/* We need to save this range to use offset */
+				slowrange = range;
 			}
 
-			addr = (void *)(long)(desc.addr + range.offset);
+			addr = (void *)(long)(desc.addr + range.offset);
 
 			err = move_to_indirect(&up_next, &i, addr, &desc,
 					       &descs, &desc_max);
@@ -287,10 +317,11 @@ __vringh_iov(struct vringh *vrh, u16 i,
 			i = desc.next;
 		} else {
 			/* Just in case we need to finish traversing above. */
-			if (unlikely(up_next > 0))
+			if (unlikely(up_next > 0)) {
 				i = return_from_indirect(vrh, &up_next,
 							 &descs, &desc_max);
-			else
+				slow = false;
+			} else
 				break;
 		}
 
@@ -479,10 +510,9 @@ static inline int putu16_user(u16 *p, u16 val)
 	return put_user(val, (__force u16 __user *)p);
 }
 
-static inline int getdesc_user(struct vring_desc *dst,
-			       const struct vring_desc *src)
+static inline int copydesc_user(void *dst, const void *src, size_t len)
 {
-	return copy_from_user(dst, (__force void __user *)src, sizeof(*dst)) ?
+	return copy_from_user(dst, (__force void __user *)src, len) ?
 		-EFAULT : 0;
 }
 
@@ -597,7 +627,7 @@ int vringh_getdesc_user(struct vringh *vrh,
 	*head = err;
 	err = __vringh_iov(vrh, *head, (struct vringh_kiov *)riov,
 			   (struct vringh_kiov *)wiov,
-			   getrange, gfp, getdesc_user);
+			   getrange, gfp, copydesc_user);
 	if (err)
 		return err;
 
@@ -711,10 +741,9 @@ static inline int putu16_kern(u16 *p, u16 val)
 	return 0;
 }
 
-static inline int getdesc_kern(struct vring_desc *dst,
-			       const struct vring_desc *src)
+static inline int copydesc_kern(void *dst, const void *src, size_t len)
 {
-	*dst = *src;
+	memcpy(dst, src, len);
 	return 0;
 }
 
@@ -806,7 +835,7 @@ int vringh_getdesc_kern(struct vringh *vrh,
 	*head = err;
 	err = __vringh_iov(vrh, *head, riov, wiov, noop_getrange,
-			   gfp, getdesc_kern);
+			   gfp, copydesc_kern);
 	if (err)
 		return err;
 
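
The heart of the slowpath is slow_copy() above: copy the descriptor in
pieces, re-running check_range() before each piece so that every piece stays
inside a single range.  A standalone model of that loop follows; the toy_
names and the fixed 4-byte "range" granularity are invented for illustration,
where the real code gets ranges from getrange() and its copy callback can
fail on a bad user pointer.

/*
 * Standalone sketch of the slow_copy() idea, not the kernel code:
 * copy an object piecewise, bounding each piece by its range.
 */
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <string.h>

/* Pretend memory is carved into 4-byte ranges (getrange_slow in spirit). */
static bool toy_check_range(uint64_t addr, uint32_t *len)
{
	uint32_t left = 4 - (addr % 4);	/* bytes left in this "range" */

	if (*len > left)
		*len = left;		/* truncate to the range boundary */
	return true;
}

static int toy_slow_copy(void *dst, const void *src, size_t len)
{
	do {
		uint32_t part = len;

		if (!toy_check_range((uint64_t)(uintptr_t)src, &part))
			return -1;	/* range lookup failed */
		memcpy(dst, src, part);	/* stands in for the copy() callback */
		dst = (char *)dst + part;
		src = (const char *)src + part;
		len -= part;
	} while (len);
	return 0;
}

int main(void)
{
	char src[16] = "0123456789abcdef", dst[16] = { 0 };

	/* Start mid-"range" so the first piece is short. */
	assert(toy_slow_copy(dst, src + 2, 10) == 0);
	assert(memcmp(dst, "23456789ab", 10) == 0);
	return 0;
}
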
diff --git a/tools/virtio/vringh_test.c b/tools/virtio/vringh_test.c
index 01ccaed..bc74c41 100644
--- a/tools/virtio/vringh_test.c
+++ b/tools/virtio/vringh_test.c
@@ -30,14 +30,33 @@ static void never_callback_guest(struct virtqueue *vq)
 	abort();
 }
 
-static inline bool getrange_iov(u64 addr, struct vringh_range *r)
+static bool getrange_iov(u64 addr, struct vringh_range *r)
 {
+	if (addr < (u64)(unsigned long)__user_addr_min - user_addr_offset)
+		return false;
+	if (addr >= (u64)(unsigned long)__user_addr_max - user_addr_offset)
+		return false;
+
 	r->start = (u64)(unsigned long)__user_addr_min - user_addr_offset;
 	r->end_incl = (u64)(unsigned long)__user_addr_max - 1 - user_addr_offset;
 	r->offset = user_addr_offset;
 	return true;
 }
 
+/* We return single byte ranges. */
+static bool getrange_slow(u64 addr, struct vringh_range *r)
+{
+	if (addr < (u64)(unsigned long)__user_addr_min - user_addr_offset)
+		return false;
+	if (addr >= (u64)(unsigned long)__user_addr_max - user_addr_offset)
+		return false;
+
+	r->start = addr;
+	r->end_incl = r->start;
+	r->offset = user_addr_offset;
+	return true;
+}
+
 struct guest_virtio_device {
 	struct virtio_device vdev;
 	int to_host_fd;
@@ -75,7 +94,8 @@ static void find_cpus(unsigned int *first, unsigned int *last)
 	}
 }
 
-static int parallel_test(unsigned long features)
+static int parallel_test(unsigned long features,
+			 bool (*getrange)(u64 addr, struct vringh_range *r))
 {
 	void *host_map, *guest_map;
 	int fd, mapsize, to_guest[2], to_host[2];
@@ -144,7 +164,7 @@ static int parallel_test(unsigned long features)
 			wiov.max = ARRAY_SIZE(host_wiov);
 			wiov.allocated = false;
 
-			err = vringh_getdesc_user(&vrh, &riov, &wiov, getrange_iov,
+			err = vringh_getdesc_user(&vrh, &riov, &wiov, getrange,
 						  &head, GFP_KERNEL);
 			if (err == 0) {
 				char buf[128];
@@ -349,6 +369,7 @@ int main(int argc, char *argv[])
 	int err;
 	unsigned i;
 	void *ret;
+	bool (*getrange)(u64 addr, struct vringh_range *r) = getrange_iov;
 
 	vdev.features[0] = 0;
 
@@ -362,8 +383,13 @@ int main(int argc, char *argv[])
 		argv++;
 	}
 
+	if (argv[1] && strcmp(argv[1], "--slow") == 0) {
+		getrange = getrange_slow;
+		argv++;
+	}
+
 	if (argv[1] && strcmp(argv[1], "--parallel") == 0)
-		return parallel_test(vdev.features[0]);
+		return parallel_test(vdev.features[0], getrange);
 
 	if (posix_memalign(&__user_addr_min, PAGE_SIZE, USER_MEM) != 0)
 		abort();
@@ -382,7 +408,7 @@ int main(int argc, char *argv[])
 	       vrh.vring.desc, vrh.vring.avail, vrh.vring.used);
 
 	/* No descriptor to get yet... */
-	err = vringh_getdesc_user(&vrh, &riov, &wiov, getrange_iov,
+	err = vringh_getdesc_user(&vrh, &riov, &wiov, getrange,
 				  &head, GFP_KERNEL);
 	if (err != 0)
 		errx(1, "vringh_getdesc_user: %i", err);
@@ -410,7 +436,7 @@ int main(int argc, char *argv[])
 	wiov.max = ARRAY_SIZE(host_wiov);
 	wiov.allocated = false;
 
-	err = vringh_getdesc_user(&vrh, &riov, &wiov, getrange_iov,
+	err = vringh_getdesc_user(&vrh, &riov, &wiov, getrange,
 				  &head, GFP_KERNEL);
 	if (err != 1)
 		errx(1, "vringh_getdesc_user: %i", err);
@@ -418,9 +444,17 @@ int main(int argc, char *argv[])
 	assert(riov.max == 1);
 	assert(riov.iov[0].iov_base == __user_addr_max - 1);
 	assert(riov.iov[0].iov_len == 1);
-	assert(wiov.max == 1);
-	assert(wiov.iov[0].iov_base == __user_addr_max - 3);
-	assert(wiov.iov[0].iov_len == 2);
+	if (getrange != getrange_slow) {
+		assert(wiov.max == 1);
+		assert(wiov.iov[0].iov_base == __user_addr_max - 3);
+		assert(wiov.iov[0].iov_len == 2);
+	} else {
+		assert(wiov.max == 2);
+		assert(wiov.iov[0].iov_base == __user_addr_max - 3);
+		assert(wiov.iov[0].iov_len == 1);
+		assert(wiov.iov[1].iov_base == __user_addr_max - 2);
+		assert(wiov.iov[1].iov_len == 1);
+	}
 
 	err = vringh_iov_pull_user(&riov, buf, 5);
 	if (err != 1)
@@ -434,7 +468,7 @@ int main(int argc, char *argv[])
 	if (err != 2)
 		errx(1, "vringh_iov_push_user: %i", err);
 	assert(memcmp(__user_addr_max - 3, "bc", 2) == 0);
-	assert(wiov.i == 1);
+	assert(wiov.i == wiov.max);
 	assert(vringh_iov_push_user(&wiov, buf, 5) == 0);
 
 	/* Host is done. */
@@ -477,14 +511,17 @@ int main(int argc, char *argv[])
 	wiov.max = ARRAY_SIZE(host_wiov);
 	wiov.allocated = false;
 
-	err = vringh_getdesc_user(&vrh, &riov, &wiov, getrange_iov,
+	err = vringh_getdesc_user(&vrh, &riov, &wiov, getrange,
 				  &head, GFP_KERNEL);
 	if (err != 1)
 		errx(1, "vringh_getdesc_user: %i", err);
 
 	assert(riov.allocated);
 	assert(riov.iov != host_riov);
-	assert(riov.max == RINGSIZE);
+	if (getrange != getrange_slow)
+		assert(riov.max == RINGSIZE);
+	else
+		assert(riov.max == RINGSIZE * USER_MEM/4);
 	assert(!wiov.allocated);
 	assert(wiov.max == 0);
 
@@ -567,7 +604,7 @@ int main(int argc, char *argv[])
 	wiov.max = ARRAY_SIZE(host_wiov);
 	wiov.allocated = false;
 
-	err = vringh_getdesc_user(&vrh, &riov, &wiov, getrange_iov,
+	err = vringh_getdesc_user(&vrh, &riov, &wiov, getrange,
 				  &head, GFP_KERNEL);
 	if (err != 1)
 		errx(1, "vringh_getdesc_user: %i", err);
 
 	if (head != n)
 		errx(1, "vringh_getdesc_user: head %i not %i", head, n);
 
-	assert(riov.max == 7);
+	if (getrange != getrange_slow)
+		assert(riov.max == 7);
+	else
+		assert(riov.max == 28);
 	assert(riov.allocated);
 	err = vringh_iov_pull_user(&riov, buf, 29);
 	assert(err == 28);
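
A note on the arithmetic behind the adjusted assertions in the new --slow
test mode: getrange_slow() returns single-byte ranges, so an N-byte
descriptor is split into N one-byte iovec entries.  That is where
wiov.max == 2 for the 2-byte write descriptor comes from, and why riov.max
grows to 28 instead of 7 (the assertions imply 7 segments of 4 bytes each)
with a worst case of RINGSIZE * USER_MEM/4 entries.  A tiny standalone model
of that counting; split_count() is an invented helper, not part of the patch:

#include <assert.h>
#include <stdint.h>

static unsigned split_count(uint64_t addr, uint32_t len, uint32_t range_size)
{
	unsigned n = 0;

	while (len) {
		uint32_t left = range_size - (addr % range_size);
		uint32_t part = len < left ? len : left;

		addr += part;
		len -= part;
		n++;		/* one iovec entry per range touched */
	}
	return n;
}

int main(void)
{
	/* Fast path: one huge range, one entry per descriptor. */
	assert(split_count(0x1000, 2, UINT32_MAX) == 1);
	/* --slow: single-byte ranges, one entry per byte (wiov.max == 2). */
	assert(split_count(0x1000, 2, 1) == 2);
	/* 7 four-byte segments (inferred) become 28 entries (riov.max). */
	assert(split_count(0x1000, 4, 1) * 7 == 28);
	return 0;
}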