On Fri, Jun 11, 2021 at 09:20:51AM +0000, liweihang wrote: > On 2021/6/4 22:50, Jason Gunthorpe wrote: > > On Fri, May 28, 2021 at 05:32:59PM +0800, Weihang Li wrote: > >> diff --git a/providers/hns/hns_roce_u_hw_v2.c b/providers/hns/hns_roce_u_hw_v2.c > >> index aa57cc4..28d455b 100644 > >> +++ b/providers/hns/hns_roce_u_hw_v2.c > >> @@ -33,10 +33,15 @@ > >> #define _GNU_SOURCE > >> #include <stdio.h> > >> #include <string.h> > >> +#include <sys/mman.h> > >> #include "hns_roce_u.h" > >> #include "hns_roce_u_db.h" > >> #include "hns_roce_u_hw_v2.h" > >> > >> +#if defined(__aarch64__) || defined(__arm__) > >> +#include <arm_neon.h> > >> +#endif > >> + > >> #define HR_IBV_OPC_MAP(ib_key, hr_key) \ > >> [IBV_WR_ ## ib_key] = HNS_ROCE_WQE_OP_ ## hr_key > >> > >> @@ -313,6 +318,39 @@ static void hns_roce_update_sq_db(struct hns_roce_context *ctx, > >> (__le32 *)&sq_db); > >> } > >> > >> +static inline void hns_roce_write512(uint64_t *dest, uint64_t *val) > >> +{ > >> +#if defined(__aarch64__) || defined(__arm__) > >> + uint64x2x4_t dwqe; > >> + > >> + /* Load multiple 4-element structures to 4 registers */ > >> + dwqe = vld4q_u64(val); > >> + /* store multiple 4-element structures from 4 registers */ > >> + vst4q_u64(dest, dwqe); > >> +#else > >> + int i; > >> + > >> + for (i = 0; i < HNS_ROCE_WRITE_TIMES; i++) > >> + hns_roce_write64(dest + i, val + HNS_ROCE_WORD_NUM * i); > >> +#endif > >> +} > > > > No code like this in providers. This should be done similarly to how > > SSE is handled on x86 > > > > This is > > > > mmio_memcpy_x64(dest, val, 64); > > > > The above should be conditionalized to trigger NEON > > > > #if defined(__aarch64__) || defined(__arm__) > > static inline void __mmio_memcpy_x64_64b(..) > > {.. > > vst4q_u64(dest, vld4q_u64(src)) > > ..} > > #endif > > > > #define mmio_memcpy_x64(dest, src, bytecount) > > ({if (__builtin_constant_p(bytecount == 64) > > __mmio_memcpy_x64_64b(dest,src,bytecount) > > ... > > > > OK, thank you. 
> > > And I'm not sure what barriers you need for prot_device, but certainly > > more than none. If you don't know then use the WC barriers > > > > ST4 instructions can guarantee the 64 bytes of data to be written at one time, so we > don't need a barrier. ARM always uses a relaxed, out-of-order storage model, so you need barriers to ensure that the ST4 is observed in order with the other writes that might be going on. Jason