> -----Original Message-----
> From: Michael Kelley (LINUX) <mikelley@xxxxxxxxxxxxx>
> Sent: Thursday, August 3, 2023 3:14 AM
> To: Saurabh Sengar <ssengar@xxxxxxxxxxxxxxxxxxx>; KY Srinivasan
> <kys@xxxxxxxxxxxxx>; Haiyang Zhang <haiyangz@xxxxxxxxxxxxx>;
> wei.liu@xxxxxxxxxx; Dexuan Cui <decui@xxxxxxxxxxxxx>;
> gregkh@xxxxxxxxxxxxxxxxxxx; corbet@xxxxxxx; linux-kernel@xxxxxxxxxxxxxxx;
> linux-hyperv@xxxxxxxxxxxxxxx; linux-doc@xxxxxxxxxxxxxxx
> Subject: [EXTERNAL] RE: [PATCH v3 2/3] tools: hv: Add vmbus_bufring
>
> From: Saurabh Sengar <ssengar@xxxxxxxxxxxxxxxxxxx> Sent: Friday, July 14, 2023 3:26 AM
> >
> > Provide a userspace interface for userspace drivers or applications to
> > read/write a VMBus ring buffer. A significant part of this code is
> > borrowed from DPDK [1]. This library is currently supported only on
> > the x86 architecture.
> >
> > To build this library:
> >     make -C tools/hv libvmbus_bufring.a
> >
> > Applications using this library can include the vmbus_bufring.h header
> > file and link against libvmbus_bufring.a statically.
> >
> > [1] https://github.com/DPDK/dpdk
> >
> > Signed-off-by: Mary Hardy <maryhardy@xxxxxxxxxxxxx>
> > Signed-off-by: Saurabh Sengar <ssengar@xxxxxxxxxxxxxxxxxxx>
> > ---
> > [V3]
> > - Made ring buffer data offset depend on page size
> > - Removed the rte_smp_rwmb macro and reused rte_compiler_barrier instead
> > - Added legal counsel sign-off
> > - Removed "Link:" tag
> > - Improved commit messages
> > - Made the new library's compilation dependent on x86
> > - Simplified mmap
> >
> > [V2]
> > - Simpler sysfs path, less parsing
> >
> >  tools/hv/Build           |   1 +
> >  tools/hv/Makefile        |  13 +-
> >  tools/hv/vmbus_bufring.c | 297 +++++++++++++++++++++++++++++++++++++++
> >  tools/hv/vmbus_bufring.h | 154 ++++++++++++++++++++
> >  4 files changed, 464 insertions(+), 1 deletion(-)
> >  create mode 100644 tools/hv/vmbus_bufring.c
> >  create mode 100644 tools/hv/vmbus_bufring.h
> >
> > diff --git a/tools/hv/Build b/tools/hv/Build
> > index 6cf51fa4b306..2a667d3d94cb 100644
> > --- a/tools/hv/Build
> > +++ b/tools/hv/Build
> > @@ -1,3 +1,4 @@
> >  hv_kvp_daemon-y += hv_kvp_daemon.o
> >  hv_vss_daemon-y += hv_vss_daemon.o
> >  hv_fcopy_daemon-y += hv_fcopy_daemon.o
> > +vmbus_bufring-y += vmbus_bufring.o
> > diff --git a/tools/hv/Makefile b/tools/hv/Makefile
> > index fe770e679ae8..33cf488fd20f 100644
> > --- a/tools/hv/Makefile
> > +++ b/tools/hv/Makefile
> > @@ -11,14 +11,19 @@ srctree := $(patsubst %/,%,$(dir $(CURDIR)))
> >  srctree := $(patsubst %/,%,$(dir $(srctree)))
> >  endif
> >
> > +include $(srctree)/tools/scripts/Makefile.arch
> > +
> >  # Do not use make's built-in rules
> >  # (this improves performance and avoids hard-to-debug behaviour);
> >  MAKEFLAGS += -r
> >
> >  override CFLAGS += -O2 -Wall -g -D_GNU_SOURCE -I$(OUTPUT)include
> >
> > +ifeq ($(SRCARCH),x86)
> > +ALL_LIBS := libvmbus_bufring.a
> > +endif
> >  ALL_TARGETS := hv_kvp_daemon hv_vss_daemon hv_fcopy_daemon
> > -ALL_PROGRAMS := $(patsubst %,$(OUTPUT)%,$(ALL_TARGETS))
> > +ALL_PROGRAMS := $(patsubst %,$(OUTPUT)%,$(ALL_TARGETS)) $(patsubst %,$(OUTPUT)%,$(ALL_LIBS))
> >
> >  ALL_SCRIPTS := hv_get_dhcp_info.sh hv_get_dns_info.sh hv_set_ifconfig.sh
> >
> > @@ -27,6 +32,12 @@ all: $(ALL_PROGRAMS)
> >
> >  export srctree OUTPUT CC LD CFLAGS
> >  include $(srctree)/tools/build/Makefile.include
> >
> > +HV_VMBUS_BUFRING_IN := $(OUTPUT)vmbus_bufring.o
> > +$(HV_VMBUS_BUFRING_IN): FORCE
> > +	$(Q)$(MAKE) $(build)=vmbus_bufring
> > +$(OUTPUT)libvmbus_bufring.a : vmbus_bufring.o
> > +	$(AR) rcs $@ $^
> > +
> >  HV_KVP_DAEMON_IN := $(OUTPUT)hv_kvp_daemon-in.o
> >  $(HV_KVP_DAEMON_IN): FORCE
> >  	$(Q)$(MAKE) $(build)=hv_kvp_daemon
> > diff --git a/tools/hv/vmbus_bufring.c b/tools/hv/vmbus_bufring.c
> > new file mode 100644
> > index 000000000000..fb1f0489c625
> > --- /dev/null
> > +++ b/tools/hv/vmbus_bufring.c
> > @@ -0,0 +1,297 @@
> > +// SPDX-License-Identifier: BSD-3-Clause
> > +/*
> > + * Copyright (c) 2009-2012,2016,2023 Microsoft Corp.
> > + * Copyright (c) 2012 NetApp Inc.
> > + * Copyright (c) 2012 Citrix Inc.
> > + * All rights reserved.
> > + */
> > +
> > +#include <errno.h>
> > +#include <emmintrin.h>
> > +#include <stdio.h>
> > +#include <string.h>
> > +#include <unistd.h>
> > +#include <sys/uio.h>
> > +#include "vmbus_bufring.h"
> > +
> > +#define rte_compiler_barrier() ({ asm volatile ("" : : : "memory"); })
> > +#define RINGDATA_START_OFFSET (getpagesize())
> > +#define VMBUS_RQST_ERROR 0xFFFFFFFFFFFFFFFF
> > +#define ALIGN(val, align) ((typeof(val))((val) & (~((typeof(val))((align) - 1)))))
> > +
> > +/* Increase bufring index by inc with wraparound */
> > +static inline uint32_t vmbus_br_idxinc(uint32_t idx, uint32_t inc, uint32_t sz)
> > +{
> > +	idx += inc;
> > +	if (idx >= sz)
> > +		idx -= sz;
> > +
> > +	return idx;
> > +}
> > +
> > +void vmbus_br_setup(struct vmbus_br *br, void *buf, unsigned int blen)
> > +{
> > +	br->vbr = buf;
> > +	br->windex = br->vbr->windex;
> > +	br->dsize = blen - RINGDATA_START_OFFSET;
> > +}
> > +
> > +static inline __always_inline void
> > +rte_smp_mb(void)
> > +{
> > +	asm volatile("lock addl $0, -128(%%rsp); " ::: "memory");
> > +}
> > +
> > +static inline int
> > +rte_atomic32_cmpset(volatile uint32_t *dst, uint32_t exp, uint32_t src)
> > +{
> > +	uint8_t res;
> > +
> > +	asm volatile("lock ; "
> > +		     "cmpxchgl %[src], %[dst];"
> > +		     "sete %[res];"
> > +		     : [res] "=a" (res), /* output */
> > +		       [dst] "=m" (*dst)
> > +		     : [src] "r" (src), /* input */
> > +		       "a" (exp),
> > +		       "m" (*dst)
> > +		     : "memory"); /* no-clobber list */
> > +	return res;
> > +}
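As an aside: the inline-asm compare-and-swap above is functionally the same as
the GCC/Clang __atomic builtin. A minimal equivalent sketch, assuming a
GCC-compatible toolchain; illustrative only, not part of the patch:

static inline int
rte_atomic32_cmpset_builtin(volatile uint32_t *dst, uint32_t exp, uint32_t src)
{
	/* Atomically: if (*dst == exp) { *dst = src; return 1; } return 0; */
	return __atomic_compare_exchange_n(dst, &exp, src, 0,
					   __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
}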
> > +
> > +static inline uint32_t
> > +vmbus_txbr_copyto(const struct vmbus_br *tbr, uint32_t windex,
> > +		  const void *src0, uint32_t cplen)
> > +{
> > +	uint8_t *br_data = (uint8_t *)tbr->vbr + RINGDATA_START_OFFSET;
> > +	uint32_t br_dsize = tbr->dsize;
> > +	const uint8_t *src = src0;
> > +
> > +	if (cplen > br_dsize - windex) {
> > +		uint32_t fraglen = br_dsize - windex;
> > +
> > +		/* Wrap-around detected */
> > +		memcpy(br_data + windex, src, fraglen);
> > +		memcpy(br_data, src + fraglen, cplen - fraglen);
> > +	} else {
> > +		memcpy(br_data + windex, src, cplen);
> > +	}
> > +
> > +	return vmbus_br_idxinc(windex, cplen, br_dsize);
> > +}
> > +
> > +/*
> > + * Write scattered channel packet to TX bufring.
> > + *
> > + * The offset of this channel packet is written as a 64-bit value
> > + * immediately after this channel packet.
> > + *
> > + * The write goes through three stages:
> > + *  1. Reserve space in ring buffer for the new data.
> > + *     Writer atomically moves priv_write_index.
> > + *  2. Copy the new data into the ring.
> > + *  3. Update the tail of the ring (visible to host) that indicates
> > + *     next read location. Writer updates write_index.
> > + */
> > +static int
> > +vmbus_txbr_write(struct vmbus_br *tbr, const struct iovec iov[], int iovlen,
> > +		 bool *need_sig)
> > +{
> > +	struct vmbus_bufring *vbr = tbr->vbr;
> > +	uint32_t ring_size = tbr->dsize;
> > +	uint32_t old_windex, next_windex, windex, total;
> > +	uint64_t save_windex;
> > +	int i;
> > +
> > +	total = 0;
> > +	for (i = 0; i < iovlen; i++)
> > +		total += iov[i].iov_len;
> > +	total += sizeof(save_windex);
> > +
> > +	/* Reserve space in ring */
> > +	do {
> > +		uint32_t avail;
> > +
> > +		/* Get current free location */
> > +		old_windex = tbr->windex;
> > +
> > +		/* Prevent compiler reordering this with calculation */
> > +		rte_compiler_barrier();
> > +
> > +		avail = vmbus_br_availwrite(tbr, old_windex);
> > +
> > +		/* If not enough space in ring, then tell caller. */
> > +		if (avail <= total)
> > +			return -EAGAIN;
> > +
> > +		next_windex = vmbus_br_idxinc(old_windex, total, ring_size);
> > +
> > +		/* Atomic update of next write_index for other threads */
> > +	} while (!rte_atomic32_cmpset(&tbr->windex, old_windex, next_windex));
> > +
> > +	/* Space from old..new is now reserved */
> > +	windex = old_windex;
> > +	for (i = 0; i < iovlen; i++)
> > +		windex = vmbus_txbr_copyto(tbr, windex, iov[i].iov_base, iov[i].iov_len);
> > +
> > +	/* Set the offset of the current channel packet. */
> > +	save_windex = ((uint64_t)old_windex) << 32;
> > +	windex = vmbus_txbr_copyto(tbr, windex, &save_windex,
> > +				   sizeof(save_windex));
> > +
> > +	/* The region reserved should match region used */
> > +	if (windex != next_windex)
> > +		return -EINVAL;
> > +
> > +	/* Ensure that data is available before updating host index */
> > +	rte_compiler_barrier();
> > +
> > +	/* Checkin for our reservation. wait for our turn to update host */
> > +	while (!rte_atomic32_cmpset(&vbr->windex, old_windex, next_windex))
> > +		_mm_pause();
> > +
> > +	return 0;
> > +}
> > +
> > +int rte_vmbus_chan_send(struct vmbus_br *txbr, uint16_t type, void *data,
> > +			uint32_t dlen, uint32_t flags)
> > +{
> > +	struct vmbus_chanpkt pkt;
> > +	unsigned int pktlen, pad_pktlen;
> > +	const uint32_t hlen = sizeof(pkt);
> > +	bool send_evt = false;
> > +	uint64_t pad = 0;
> > +	struct iovec iov[3];
> > +	int error;
> > +
> > +	pktlen = hlen + dlen;
> > +	pad_pktlen = ALIGN(pktlen, sizeof(uint64_t));
>
> This ALIGN function rounds down.  So pad_pktlen could be less than pktlen.

Thanks for pointing this out, will fix.

> > +
> > +	pkt.hdr.type = type;
> > +	pkt.hdr.flags = flags;
> > +	pkt.hdr.hlen = hlen >> VMBUS_CHANPKT_SIZE_SHIFT;
> > +	pkt.hdr.tlen = pad_pktlen >> VMBUS_CHANPKT_SIZE_SHIFT;
> > +	pkt.hdr.xactid = VMBUS_RQST_ERROR; /* doesn't support multiple requests at same time */
> > +
> > +	iov[0].iov_base = &pkt;
> > +	iov[0].iov_len = hlen;
> > +	iov[1].iov_base = data;
> > +	iov[1].iov_len = dlen;
> > +	iov[2].iov_base = &pad;
> > +	iov[2].iov_len = pad_pktlen - pktlen;
>
> Given the way your ALIGN function works, the above could produce a
> negative value for iov[2].iov_len.  Then bad things will happen. :-(

Got it.
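One way to address both comments above is to round up instead of down, so
that pad_pktlen >= pktlen always holds and iov[2].iov_len stays within
[0, 7]. A minimal sketch, assuming the alignment is a power of two; the
ALIGN_UP name is illustrative and not part of the posted patch:

#define ALIGN_UP(val, align) \
	((typeof(val))(((val) + ((typeof(val))(align) - 1)) & \
		       ~((typeof(val))(align) - 1)))

	/* e.g. in rte_vmbus_chan_send(): pad to the next 8-byte boundary */
	pad_pktlen = ALIGN_UP(pktlen, sizeof(uint64_t));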
> > +
> > +	error = vmbus_txbr_write(txbr, iov, 3, &send_evt);
> > +
> > +	return error;
> > +}
> > +

<snip>

> > +	 * we can request that the receiver interrupt the sender
> > +	 * when the ring transitions from being full to being able
> > +	 * to handle a message of size "pending_send_sz".
> > +	 *
> > +	 * Add necessary state for this enhancement.
> > +	 */
> > +	volatile uint32_t pending_send;
> > +	uint32_t reserved1[12];
> > +
> > +	union {
> > +		struct {
> > +			uint32_t feat_pending_send_sz:1;
> > +		};
> > +		uint32_t value;
> > +	} feature_bits;
> > +
> > +	/*
> > +	 * Ring data starts here + RingDataStartOffset
>
> This mention of RingDataStartOffset looks stale.  I could not find it defined
> anywhere.

Will correct it to: "Ring data starts after PAGE_SIZE offset from the start of
this struct (RINGDATA_START_OFFSET)."

- Saurabh
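The commit message notes that applications include vmbus_bufring.h and link
against libvmbus_bufring.a. For illustration, a minimal sketch of such a
caller; the device node, mapping layout, ring length, and packet-type value
below are assumptions for this example and are not defined by this patch:

#include <fcntl.h>
#include <stdint.h>
#include <sys/mman.h>
#include <unistd.h>
#include "vmbus_bufring.h"

int send_one_packet(void)
{
	struct vmbus_br txbr;
	uint8_t msg[64] = { 0 };
	size_t ring_len = 4 * getpagesize();	/* assumed ring length */
	void *ring;
	int fd, ret;

	fd = open("/dev/uio0", O_RDWR);		/* assumed device node */
	if (fd < 0)
		return -1;

	/* Assume the TX ring is the first mappable region of the device */
	ring = mmap(NULL, ring_len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (ring == MAP_FAILED) {
		close(fd);
		return -1;
	}

	/* First page holds the control structure; ring data follows it */
	vmbus_br_setup(&txbr, ring, ring_len);

	/* 0x6 is the usual inband packet type; assumed here for illustration */
	ret = rte_vmbus_chan_send(&txbr, 0x6, msg, sizeof(msg), 0);

	munmap(ring, ring_len);
	close(fd);
	return ret;
}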