On 2014/12/15 21:29, Arnd Bergmann wrote:
> On Thursday 11 December 2014 19:42:27 Ding Tianhong wrote:
>> v9:
>> - There are no tx completion interrupts to free DMA'd Tx packets, which means that
>> we rely on new tx packets arriving to run the destructors of completed packets,
>> which opens up space in their sockets' send queues. Sometimes we don't get such
>> new packets, causing Tx to stall; a single UDP transmitter is a good example of
>> this situation, so we need a cleanup workqueue to reclaim completed packets.
>> The workqueue will only free packets that have already been pending for several jiffies.
>> Also fix some format cleanups.
>
> You must have missed the reply in which David Miller explained why
> you can't call skb_orphan and rely on an occasional cleanup of the
> queue. Please use something like the patch below:
>
> - drop the broken skb_orphan call
> - drop the workqueue
> - batch cleanup based on tx_coalesce_frames/usecs for better throughput
> - use a reasonable default tx timeout (200us, could be shortened
>   based on measurements) with a range timer
> - fix napi poll function return value
> - use a lockless queue for cleanup
>

Ok, agreed; my comments are below.

> Signed-off-by: Arnd Bergmann <arnd@xxxxxxxx>
>
> Feel free to fold this into your patch rather than keeping it separate.
> Completely untested, probably contains some bugs. Please ask if you
> have questions about this.
>
> diff --git a/drivers/net/ethernet/hisilicon/hip04_eth.c b/drivers/net/ethernet/hisilicon/hip04_eth.c
> index 9d37b670db6a..d85c5287654e 100644
> --- a/drivers/net/ethernet/hisilicon/hip04_eth.c
> +++ b/drivers/net/ethernet/hisilicon/hip04_eth.c
> @@ -12,6 +12,7 @@
>  #include <linux/etherdevice.h>
>  #include <linux/platform_device.h>
>  #include <linux/interrupt.h>
> +#include <linux/ktime.h>
>  #include <linux/of_address.h>
>  #include <linux/phy.h>
>  #include <linux/of_mdio.h>
> @@ -157,9 +158,11 @@ struct hip04_priv {
>          struct sk_buff *tx_skb[TX_DESC_NUM];
>          dma_addr_t tx_phys[TX_DESC_NUM];
>          unsigned int tx_head;
> -        unsigned int tx_tail;
> -        int tx_count;
> -        unsigned long last_tx;
> +
> +        /* FIXME: make these adjustable through ethtool */
> +        int tx_coalesce_frames;
> +        int tx_coalesce_usecs;
> +        struct hrtimer tx_coalesce_timer;
>
>          unsigned char *rx_buf[RX_DESC_NUM];
>          dma_addr_t rx_phys[RX_DESC_NUM];
> @@ -171,10 +174,15 @@ struct hip04_priv {
>          struct regmap *map;
>          struct work_struct tx_timeout_task;
>
> -        struct workqueue_struct *wq;
> -        struct delayed_work tx_clean_task;
> +        /* written only by tx cleanup */
> +        unsigned int tx_tail ____cacheline_aligned_in_smp;
>  };
>
> +static inline unsigned int tx_count(unsigned int head, unsigned int tail)
> +{
> +        return (head - tail) % (TX_DESC_NUM - 1);
> +}
> +
>  static void hip04_config_port(struct net_device *ndev, u32 speed, u32 duplex)
>  {
>          struct hip04_priv *priv = netdev_priv(ndev);
> @@ -351,18 +359,21 @@ static int hip04_set_mac_address(struct net_device *ndev, void *addr)
>          return 0;
>  }
>
> -static void hip04_tx_reclaim(struct net_device *ndev, bool force)
> +static int hip04_tx_reclaim(struct net_device *ndev, bool force)
>  {
>          struct hip04_priv *priv = netdev_priv(ndev);
>          unsigned tx_tail = priv->tx_tail;
>          struct tx_desc *desc;
>          unsigned int bytes_compl = 0, pkts_compl = 0;
> +        unsigned int count;
>
> -        if (priv->tx_count == 0)
> +        smp_rmb();
> +        count = tx_count(ACCESS_ONCE(priv->tx_head), tx_tail);
> +        if (count == 0)
>                  goto out;
>
> -        while ((tx_tail != priv->tx_head) || (priv->tx_count == TX_DESC_NUM)) {
> -                desc = &priv->tx_desc[priv->tx_tail];
> +        while (count) {
> +                desc = &priv->tx_desc[tx_tail];
>                  if (desc->send_addr != 0) {
>                          if (force)
>                                  desc->send_addr = 0;
> @@ -381,57 +392,37 @@ static void hip04_tx_reclaim(struct net_device *ndev, bool force)
>                  dev_kfree_skb(priv->tx_skb[tx_tail]);
>                  priv->tx_skb[tx_tail] = NULL;
>                  tx_tail = TX_NEXT(tx_tail);
> -                priv->tx_count--;
> -
> -                if (priv->tx_count <= 0)
> -                        break;
> +                count--;
>          }
>
>          priv->tx_tail = tx_tail;
> +        smp_wmb(); /* Ensure tx_tail visible to xmit */
>
> -        /* Ensure tx_tail & tx_count visible to xmit */
> -        smp_mb();
>  out:
> -
>          if (pkts_compl || bytes_compl)
>                  netdev_completed_queue(ndev, pkts_compl, bytes_compl);
>
> -        if (unlikely(netif_queue_stopped(ndev)) &&
> -            (priv->tx_count < TX_DESC_NUM))
> +        if (unlikely(netif_queue_stopped(ndev)) && (count < (TX_DESC_NUM - 1)))
>                  netif_wake_queue(ndev);
> -}
>
> -static void hip04_tx_clean_monitor(struct work_struct *work)
> -{
> -        struct hip04_priv *priv = container_of(work, struct hip04_priv,
> -                                               tx_clean_task.work);
> -        struct net_device *ndev = priv->ndev;
> -        int delta_in_ticks = msecs_to_jiffies(1000);
> -
> -        if (!time_in_range(jiffies, priv->last_tx,
> -                           priv->last_tx + delta_in_ticks)) {
> -                netif_tx_lock(ndev);
> -                hip04_tx_reclaim(ndev, false);
> -                netif_tx_unlock(ndev);
> -        }
> -        queue_delayed_work(priv->wq, &priv->tx_clean_task, delta_in_ticks);
> +        return count;

I think it should return pkts_compl instead: the loop may break out early on a
busy descriptor, so pkts_compl can be smaller than count. And we need to take
netif_tx_lock() around this function to avoid concurrency conflicts with the
xmit path; see the untested sketch below.
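Something like this is what I mean for the locking part (untested sketch only;
hip04_tx_reclaim() is the function above, netif_tx_lock()/netif_tx_unlock() are
the standard netdev helpers, and the wrapper name is just an example):

        /* Serialize the reclaim path against ndo_start_xmit so that
         * both sides see a consistent tx_head/tx_tail pair.
         */
        static int hip04_tx_reclaim_locked(struct net_device *ndev, bool force)
        {
                int ret;

                netif_tx_lock(ndev);
                ret = hip04_tx_reclaim(ndev, force);
                netif_tx_unlock(ndev);

                return ret;
        }

Callers such as hip04_rx_poll() would then use hip04_tx_reclaim_locked()
instead of calling hip04_tx_reclaim() directly.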
>  }
>
>  static int hip04_mac_start_xmit(struct sk_buff *skb, struct net_device *ndev)
>  {
>          struct hip04_priv *priv = netdev_priv(ndev);
>          struct net_device_stats *stats = &ndev->stats;
> -        unsigned int tx_head = priv->tx_head;
> +        unsigned int tx_head = priv->tx_head, count;
>          struct tx_desc *desc = &priv->tx_desc[tx_head];
>          dma_addr_t phys;
>
> -        if (priv->tx_count >= TX_DESC_NUM) {
> +        smp_rmb();
> +        count = tx_count(tx_head, ACCESS_ONCE(priv->tx_tail));
> +        if (count == (TX_DESC_NUM - 1)) {
>                  netif_stop_queue(ndev);
>                  return NETDEV_TX_BUSY;
>          }
>
> -        hip04_tx_reclaim(ndev, false);
> -
>          phys = dma_map_single(&ndev->dev, skb->data, skb->len, DMA_TO_DEVICE);
>          if (dma_mapping_error(&ndev->dev, phys)) {
>                  dev_kfree_skb(skb);
> @@ -447,20 +438,33 @@ static int hip04_mac_start_xmit(struct sk_buff *skb, struct net_device *ndev)
>          desc->wb_addr = cpu_to_be32(phys);
>          skb_tx_timestamp(skb);
>
> -        /* Don't wait up for transmitted skbs to be freed. */
> -        skb_orphan(skb);
> -
> +        /* FIXME: eliminate this mmio access if xmit_more is set */
>          hip04_set_xmit_desc(priv, phys);
>          priv->tx_head = TX_NEXT(tx_head);
> +        count++;
>          netdev_sent_queue(ndev, skb->len);
>
>          stats->tx_bytes += skb->len;
>          stats->tx_packets++;
> -        priv->tx_count++;
> -        priv->last_tx = jiffies;
>
> -        /* Ensure tx_head & tx_count update visible to tx reclaim */
> -        smp_mb();
> +        /* Ensure tx_head update visible to tx reclaim */
> +        smp_wmb();
> +
> +        /* queue is getting full, better start cleaning up now */
> +        if (count >= priv->tx_coalesce_frames) {
> +                if (napi_schedule_prep(&priv->napi)) {
> +                        /* disable rx interrupt and timer */
> +                        priv->reg_inten &= ~(RCV_INT);
> +                        writel_relaxed(DEF_INT_MASK & ~RCV_INT,
> +                                       priv->base + PPE_INTEN);
> +                        hrtimer_cancel(&priv->tx_coalesce_timer);
> +                        __napi_schedule(&priv->napi);
> +                }
> +        } else if (!hrtimer_is_queued(&priv->tx_coalesce_timer)) {
> +                /* cleanup not pending yet, start a new timer */
> +                hrtimer_start_expires(&priv->tx_coalesce_timer,
> +                                      HRTIMER_MODE_REL);
> +        }
>
>          return NETDEV_TX_OK;
>  }
> @@ -477,6 +481,7 @@ static int hip04_rx_poll(struct napi_struct *napi, int budget)
>          bool last = false;
>          dma_addr_t phys;
>          int rx = 0;
> +        int tx_remaining;
>          u16 len;
>          u32 err;
>
> @@ -513,11 +518,11 @@ static int hip04_rx_poll(struct napi_struct *napi, int budget)
>
>                  buf = netdev_alloc_frag(priv->rx_buf_size);
>                  if (!buf)
> -                        return -ENOMEM;
> +                        goto done;
>                  phys = dma_map_single(&ndev->dev, buf,
>                                        RX_BUF_SIZE, DMA_FROM_DEVICE);
>                  if (dma_mapping_error(&ndev->dev, phys))
> -                        return -EIO;
> +                        goto done;
>                  priv->rx_buf[priv->rx_head] = buf;
>                  priv->rx_phys[priv->rx_head] = phys;
>                  hip04_set_recv_desc(priv, phys);
> @@ -537,6 +542,11 @@ static int hip04_rx_poll(struct napi_struct *napi, int budget)
>          }
>          napi_complete(napi);
>  done:
> +        /* clean up tx descriptors and start a new timer if necessary */
> +        tx_remaining = hip04_tx_reclaim(ndev, false);
> +        if (rx < budget && tx_remaining)
> +                hrtimer_start_expires(&priv->tx_coalesce_timer, HRTIMER_MODE_REL);
> +
>          return rx;
>  }
>
> @@ -547,6 +557,9 @@ static irqreturn_t hip04_mac_interrupt(int irq, void *dev_id)
>          struct net_device_stats *stats = &ndev->stats;
>          u32 ists = readl_relaxed(priv->base + PPE_INTSTS);
>
> +        if (!ists)
> +                return IRQ_NONE;
> +
>          writel_relaxed(DEF_INT_MASK, priv->base + PPE_RINT);
>
>          if (unlikely(ists & DEF_INT_ERR)) {
> @@ -560,16 +573,32 @@
>                  }
>          }
>
> -        if (ists & RCV_INT) {
> +        if (ists & RCV_INT && napi_schedule_prep(&priv->napi)) {
>                  /* disable rx interrupt */
>                  priv->reg_inten &= ~(RCV_INT);
> -                writel_relaxed(priv->reg_inten, priv->base + PPE_INTEN);
> -                napi_schedule(&priv->napi);
> +                writel_relaxed(DEF_INT_MASK & ~RCV_INT, priv->base + PPE_INTEN);
> +                hrtimer_cancel(&priv->tx_coalesce_timer);
> +                __napi_schedule(&priv->napi);
>          }
>
>          return IRQ_HANDLED;
>  }
>
> +enum hrtimer_restart tx_done(struct hrtimer *hrtimer)
> +{
> +        struct hip04_priv *priv;
> +        priv = container_of(hrtimer, struct hip04_priv, tx_coalesce_timer);
> +
> +        if (napi_schedule_prep(&priv->napi)) {
> +                /* disable rx interrupt */
> +                priv->reg_inten &= ~(RCV_INT);
> +                writel_relaxed(DEF_INT_MASK & ~RCV_INT, priv->base + PPE_INTEN);
> +                __napi_schedule(&priv->napi);
> +        }
> +
> +        return HRTIMER_NORESTART;
> +}
> +
>  static void hip04_adjust_link(struct net_device *ndev)
>  {
>          struct hip04_priv *priv = netdev_priv(ndev);
> @@ -589,7 +618,6 @@ static int hip04_mac_open(struct net_device *ndev)
>          priv->rx_head = 0;
>          priv->tx_head = 0;
>          priv->tx_tail = 0;
> -        priv->tx_count = 0;
>          hip04_reset_ppe(priv);
>
>          for (i = 0; i < RX_DESC_NUM; i++) {
> @@ -612,9 +640,6 @@ static int hip04_mac_open(struct net_device *ndev)
>          hip04_mac_enable(ndev);
>          napi_enable(&priv->napi);
>
> -        INIT_DELAYED_WORK(&priv->tx_clean_task, hip04_tx_clean_monitor);
> -        queue_delayed_work(priv->wq, &priv->tx_clean_task, 0);
> -
>          return 0;
>  }
>
> @@ -623,8 +648,6 @@ static int hip04_mac_stop(struct net_device *ndev)
>  {
>          struct hip04_priv *priv = netdev_priv(ndev);
>          int i;
>
> -        cancel_delayed_work_sync(&priv->tx_clean_task);
> -

I think we should cancel the hrtimer when the device is closed and arm the
timer again when it is opened; see the untested hunk below.
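Something like this on top of your patch (untested sketch; hrtimer_cancel()
waits for a running callback to finish, so the coalesce timer cannot fire once
the device is down):

@@ static int hip04_mac_stop(struct net_device *ndev)
 {
         struct hip04_priv *priv = netdev_priv(ndev);
         int i;
 
+        /* stop the tx coalesce timer; hrtimer_cancel() waits for a
+         * running tx_done() callback to finish, so it cannot fire
+         * after the device is down */
+        hrtimer_cancel(&priv->tx_coalesce_timer);
+
         napi_disable(&priv->napi);

The open path may not need an explicit start, since hip04_mac_start_xmit()
arms the timer again once packets are queued.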
>          napi_disable(&priv->napi);
>          netif_stop_queue(ndev);
>          hip04_mac_disable(ndev);
> @@ -725,6 +748,7 @@ static int hip04_mac_probe(struct platform_device *pdev)
>          struct hip04_priv *priv;
>          struct resource *res;
>          unsigned int irq;
> +        ktime_t txtime;
>          int ret;
>
>          ndev = alloc_etherdev(sizeof(struct hip04_priv));
> @@ -751,6 +775,21 @@ static int hip04_mac_probe(struct platform_device *pdev)
>          priv->port = arg.args[0];
>          priv->chan = arg.args[1] * RX_DESC_NUM;
>
> +        hrtimer_init(&priv->tx_coalesce_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
> +
> +        /*
> +         * BQL will try to keep the TX queue as short as possible, but it can't
> +         * be faster than tx_coalesce_usecs, so we need a fast timeout here,
> +         * but also long enough to gather up enough frames to ensure we don't
> +         * get more interrupts than necessary.
> +         * 200us is enough for 16 frames of 1500 bytes at gigabit ethernet rate
> +         */
> +        priv->tx_coalesce_frames = TX_DESC_NUM * 3 / 4;
> +        priv->tx_coalesce_usecs = 200;
> +        /* allow timer to fire after half the time at the earliest */
> +        txtime = ktime_set(0, priv->tx_coalesce_usecs * NSEC_PER_USEC / 2);
> +        hrtimer_set_expires_range(&priv->tx_coalesce_timer, txtime, txtime);
> +

I think this is missing the line:

        priv->tx_coalesce_timer.function = tx_done;
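Without it the timer has no callback to run when it expires. The probe-time
init would then look like this (untested sketch; tx_done is the handler
introduced above):

        hrtimer_init(&priv->tx_coalesce_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
        priv->tx_coalesce_timer.function = tx_done;

As a sanity check on the 200us comment: 16 frames * 1500 bytes * 8 bits =
192000 bits, which takes about 192us on the wire at 1Gbit/s, so the 200us
default matches the stated rationale.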
Regards
Ding

>          priv->map = syscon_node_to_regmap(arg.np);
>          if (IS_ERR(priv->map)) {
>                  dev_warn(d, "no syscon hisilicon,hip04-ppe\n");
> @@ -788,12 +827,6 @@
>                  }
>          }
>
> -        priv->wq = create_singlethread_workqueue(ndev->name);
> -        if (!priv->wq) {
> -                ret = -ENOMEM;
> -                goto init_fail;
> -        }
> -
>          INIT_WORK(&priv->tx_timeout_task, hip04_tx_timeout_task);
>
>          ether_setup(ndev);
> @@ -848,8 +881,6 @@ static int hip04_remove(struct platform_device *pdev)
>          free_irq(ndev->irq, ndev);
>          of_node_put(priv->phy_node);
>          cancel_work_sync(&priv->tx_timeout_task);
> -        if (priv->wq)
> -                destroy_workqueue(priv->wq);
>          free_netdev(ndev);
>
>          return 0;