On Fri, Mar 03, 2023 at 08:21:15AM +0000, Kai wrote: > The refcount wr_tx_refcnt may cause cache thrashing problems among > cores and we can use percpu ref to mitigate this issue here. We > gain some performance improvement with percpu ref here on our > customized smc-r verion. Applying cache alignment may also mitigate > this problem but it seem more reasonable to use percpu ref here. > We can also replace wr_reg_refcnt with one percpu reference like > wr_tx_refcnt. > > redis-benchmark on smc-r with atomic wr_tx_refcnt: > SET: 525817.62 requests per second, p50=0.087 msec > GET: 570841.44 requests per second, p50=0.087 msec > > redis-benchmark on the percpu_ref version: > SET: 539956.81 requests per second, p50=0.087 msec > GET: 587613.12 requests per second, p50=0.079 msec Does the test data need to be refreshed? > > Signed-off-by: Kai <KaiShen@xxxxxxxxxxxxxxxxx> > > v1->v2: > - Modify patch prefix > > v2->v3: > - Make wr_reg_refcnt a percpu one as well > - Init percpu ref with 0 flag instead of ALLOW_REINIT flag > --- > net/smc/smc_core.h | 10 ++++++++-- > net/smc/smc_wr.c | 35 ++++++++++++++++++++++++++++------- > net/smc/smc_wr.h | 5 ++--- > 3 files changed, 38 insertions(+), 12 deletions(-) > > diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h > index 08b457c2d294..1645fba0d2d3 100644 > --- a/net/smc/smc_core.h > +++ b/net/smc/smc_core.h > @@ -106,7 +106,10 @@ struct smc_link { > unsigned long *wr_tx_mask; /* bit mask of used indexes */ > u32 wr_tx_cnt; /* number of WR send buffers */ > wait_queue_head_t wr_tx_wait; /* wait for free WR send buf */ > - atomic_t wr_tx_refcnt; /* tx refs to link */ > + struct { > + struct percpu_ref wr_tx_refs; > + } ____cacheline_aligned_in_smp; > + struct completion tx_ref_comp; For the variable names suffixed with wr_*_refs, should we continue to use wr_*_refcnt? Thanks. > > struct smc_wr_buf *wr_rx_bufs; /* WR recv payload buffers */ > struct ib_recv_wr *wr_rx_ibs; /* WR recv meta data */ > @@ -122,7 +125,10 @@ struct smc_link { > > struct ib_reg_wr wr_reg; /* WR register memory region */ > wait_queue_head_t wr_reg_wait; /* wait for wr_reg result */ > - atomic_t wr_reg_refcnt; /* reg refs to link */ > + struct { > + struct percpu_ref wr_reg_refs; > + } ____cacheline_aligned_in_smp; > + struct completion reg_ref_comp; > enum smc_wr_reg_state wr_reg_state; /* state of wr_reg request */ > > u8 gid[SMC_GID_SIZE];/* gid matching used vlan id*/ > diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c > index b0678a417e09..0021065a600a 100644 > --- a/net/smc/smc_wr.c > +++ b/net/smc/smc_wr.c > @@ -377,12 +377,11 @@ int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr) > if (rc) > return rc; > > - atomic_inc(&link->wr_reg_refcnt); > + percpu_ref_get(&link->wr_reg_refs); > rc = wait_event_interruptible_timeout(link->wr_reg_wait, > (link->wr_reg_state != POSTED), > SMC_WR_REG_MR_WAIT_TIME); > - if (atomic_dec_and_test(&link->wr_reg_refcnt)) > - wake_up_all(&link->wr_reg_wait); > + percpu_ref_put(&link->wr_reg_refs); > if (!rc) { > /* timeout - terminate link */ > smcr_link_down_cond_sched(link); > @@ -647,8 +646,10 @@ void smc_wr_free_link(struct smc_link *lnk) > smc_wr_wakeup_tx_wait(lnk); > > smc_wr_tx_wait_no_pending_sends(lnk); > - wait_event(lnk->wr_reg_wait, (!atomic_read(&lnk->wr_reg_refcnt))); > - wait_event(lnk->wr_tx_wait, (!atomic_read(&lnk->wr_tx_refcnt))); > + percpu_ref_kill(&lnk->wr_reg_refs); > + wait_for_completion(&lnk->reg_ref_comp); > + percpu_ref_kill(&lnk->wr_tx_refs); > + wait_for_completion(&lnk->tx_ref_comp); > > if (lnk->wr_rx_dma_addr) { > ib_dma_unmap_single(ibdev, lnk->wr_rx_dma_addr, > @@ -847,6 +848,20 @@ void smc_wr_add_dev(struct smc_ib_device *smcibdev) > tasklet_setup(&smcibdev->send_tasklet, smc_wr_tx_tasklet_fn); > } > > +static void smcr_wr_tx_refs_free(struct percpu_ref *ref) > +{ > + struct smc_link *lnk = container_of(ref, struct smc_link, wr_tx_refs); > + > + complete(&lnk->tx_ref_comp); > +} > + > +static void smcr_wr_reg_refs_free(struct percpu_ref *ref) > +{ > + struct smc_link *lnk = container_of(ref, struct smc_link, wr_reg_refs); > + > + complete(&lnk->reg_ref_comp); > +} > + > int smc_wr_create_link(struct smc_link *lnk) > { > struct ib_device *ibdev = lnk->smcibdev->ibdev; > @@ -890,9 +905,15 @@ int smc_wr_create_link(struct smc_link *lnk) > smc_wr_init_sge(lnk); > bitmap_zero(lnk->wr_tx_mask, SMC_WR_BUF_CNT); > init_waitqueue_head(&lnk->wr_tx_wait); > - atomic_set(&lnk->wr_tx_refcnt, 0); > + rc = percpu_ref_init(&lnk->wr_tx_refs, smcr_wr_tx_refs_free, 0, GFP_KERNEL); > + if (rc) > + goto dma_unmap; > + init_completion(&lnk->tx_ref_comp); > init_waitqueue_head(&lnk->wr_reg_wait); > - atomic_set(&lnk->wr_reg_refcnt, 0); > + rc = percpu_ref_init(&lnk->wr_reg_refs, smcr_wr_reg_refs_free, 0, GFP_KERNEL); > + if (rc) > + goto dma_unmap; > + init_completion(&lnk->reg_ref_comp); > init_waitqueue_head(&lnk->wr_rx_empty_wait); > return rc; > > diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h > index 45e9b894d3f8..f3008dda222a 100644 > --- a/net/smc/smc_wr.h > +++ b/net/smc/smc_wr.h > @@ -63,14 +63,13 @@ static inline bool smc_wr_tx_link_hold(struct smc_link *link) > { > if (!smc_link_sendable(link)) > return false; > - atomic_inc(&link->wr_tx_refcnt); > + percpu_ref_get(&link->wr_tx_refs); > return true; > } > > static inline void smc_wr_tx_link_put(struct smc_link *link) > { > - if (atomic_dec_and_test(&link->wr_tx_refcnt)) > - wake_up_all(&link->wr_tx_wait); > + percpu_ref_put(&link->wr_tx_refs); > } > > static inline void smc_wr_drain_cq(struct smc_link *lnk) > -- > 2.31.1