On Thursday 11 December 2014 19:42:27 Ding Tianhong wrote:
> v9:
> - There are no tx completion interrupts to free DMA'd Tx packets, which means
> that we rely on new tx packets arriving to run the destructors of completed
> packets, which opens up space in their sockets' send queues. Sometimes we
> don't get such new packets, causing Tx to stall; a single UDP transmitter is
> a good example of this situation. So we need a cleanup workqueue to reclaim
> completed packets; the workqueue will only free packets that have already
> been pending for several jiffies.
> Also apply some formatting cleanups.

You must have missed the reply in which David Miller explained why you
can't call skb_orphan and rely on an occasional cleanup of the queue.

Please use something like the patch below:

- drop the broken skb_orphan call
- drop the workqueue
- batch cleanup based on tx_coalesce_frames/usecs for better throughput
- use a reasonable default tx timeout (200us, could be shortened based on
  measurements) with a range timer
- fix napi poll function return value
- use a lockless queue for cleanup (sketched in plain C after the patch)

Signed-off-by: Arnd Bergmann <arnd@xxxxxxxx>

Feel free to fold this into your patch rather than keeping it separate.
Completely untested, probably contains some bugs. Please ask if you have
questions about this.

diff --git a/drivers/net/ethernet/hisilicon/hip04_eth.c b/drivers/net/ethernet/hisilicon/hip04_eth.c
index 9d37b670db6a..d85c5287654e 100644
--- a/drivers/net/ethernet/hisilicon/hip04_eth.c
+++ b/drivers/net/ethernet/hisilicon/hip04_eth.c
@@ -12,6 +12,7 @@
 #include <linux/etherdevice.h>
 #include <linux/platform_device.h>
 #include <linux/interrupt.h>
+#include <linux/ktime.h>
 #include <linux/of_address.h>
 #include <linux/phy.h>
 #include <linux/of_mdio.h>
@@ -157,9 +158,11 @@ struct hip04_priv {
 	struct sk_buff *tx_skb[TX_DESC_NUM];
 	dma_addr_t tx_phys[TX_DESC_NUM];
 	unsigned int tx_head;
-	unsigned int tx_tail;
-	int tx_count;
-	unsigned long last_tx;
+
+	/* FIXME: make these adjustable through ethtool */
+	int tx_coalesce_frames;
+	int tx_coalesce_usecs;
+	struct hrtimer tx_coalesce_timer;
 
 	unsigned char *rx_buf[RX_DESC_NUM];
 	dma_addr_t rx_phys[RX_DESC_NUM];
@@ -171,10 +174,15 @@ struct hip04_priv {
 	struct regmap *map;
 	struct work_struct tx_timeout_task;
 
-	struct workqueue_struct *wq;
-	struct delayed_work tx_clean_task;
+	/* written only by tx cleanup */
+	unsigned int tx_tail ____cacheline_aligned_in_smp;
 };
 
+static inline unsigned int tx_count(unsigned int head, unsigned int tail)
+{
+	return (head - tail) % (TX_DESC_NUM - 1);
+}
+
 static void hip04_config_port(struct net_device *ndev, u32 speed, u32 duplex)
 {
 	struct hip04_priv *priv = netdev_priv(ndev);
@@ -351,18 +359,21 @@ static int hip04_set_mac_address(struct net_device *ndev, void *addr)
 	return 0;
 }
 
-static void hip04_tx_reclaim(struct net_device *ndev, bool force)
+static int hip04_tx_reclaim(struct net_device *ndev, bool force)
 {
 	struct hip04_priv *priv = netdev_priv(ndev);
 	unsigned tx_tail = priv->tx_tail;
 	struct tx_desc *desc;
 	unsigned int bytes_compl = 0, pkts_compl = 0;
+	unsigned int count;
 
-	if (priv->tx_count == 0)
+	smp_rmb();
+	count = tx_count(ACCESS_ONCE(priv->tx_head), tx_tail);
+	if (count == 0)
 		goto out;
 
-	while ((tx_tail != priv->tx_head) || (priv->tx_count == TX_DESC_NUM)) {
-		desc = &priv->tx_desc[priv->tx_tail];
+	while (count) {
+		desc = &priv->tx_desc[tx_tail];
 		if (desc->send_addr != 0) {
 			if (force)
 				desc->send_addr = 0;
@@ -381,57 +392,37 @@ static void hip04_tx_reclaim(struct net_device *ndev, bool force)
 		dev_kfree_skb(priv->tx_skb[tx_tail]);
 		priv->tx_skb[tx_tail] = NULL;
 		tx_tail = TX_NEXT(tx_tail);
-		priv->tx_count--;
-
-		if (priv->tx_count <= 0)
-			break;
+		count--;
 	}
 
 	priv->tx_tail = tx_tail;
+	smp_wmb(); /* Ensure tx_tail visible to xmit */
 
-	/* Ensure tx_tail & tx_count visible to xmit */
-	smp_mb();
 out:
-	if (pkts_compl || bytes_compl)
 		netdev_completed_queue(ndev, pkts_compl, bytes_compl);
 
-	if (unlikely(netif_queue_stopped(ndev)) &&
-	    (priv->tx_count < TX_DESC_NUM))
+	if (unlikely(netif_queue_stopped(ndev)) && (count < (TX_DESC_NUM - 1)))
 		netif_wake_queue(ndev);
-}
 
-static void hip04_tx_clean_monitor(struct work_struct *work)
-{
-	struct hip04_priv *priv = container_of(work, struct hip04_priv,
-					       tx_clean_task.work);
-	struct net_device *ndev = priv->ndev;
-	int delta_in_ticks = msecs_to_jiffies(1000);
-
-	if (!time_in_range(jiffies, priv->last_tx,
-			   priv->last_tx + delta_in_ticks)) {
-		netif_tx_lock(ndev);
-		hip04_tx_reclaim(ndev, false);
-		netif_tx_unlock(ndev);
-	}
-	queue_delayed_work(priv->wq, &priv->tx_clean_task, delta_in_ticks);
+	return count;
 }
 
 static int hip04_mac_start_xmit(struct sk_buff *skb, struct net_device *ndev)
 {
 	struct hip04_priv *priv = netdev_priv(ndev);
 	struct net_device_stats *stats = &ndev->stats;
-	unsigned int tx_head = priv->tx_head;
+	unsigned int tx_head = priv->tx_head, count;
 	struct tx_desc *desc = &priv->tx_desc[tx_head];
 	dma_addr_t phys;
 
-	if (priv->tx_count >= TX_DESC_NUM) {
+	smp_rmb();
+	count = tx_count(tx_head, ACCESS_ONCE(priv->tx_tail));
+	if (count == (TX_DESC_NUM - 1)) {
 		netif_stop_queue(ndev);
 		return NETDEV_TX_BUSY;
 	}
 
-	hip04_tx_reclaim(ndev, false);
-
 	phys = dma_map_single(&ndev->dev, skb->data, skb->len, DMA_TO_DEVICE);
 	if (dma_mapping_error(&ndev->dev, phys)) {
 		dev_kfree_skb(skb);
@@ -447,20 +438,33 @@ static int hip04_mac_start_xmit(struct sk_buff *skb, struct net_device *ndev)
 	desc->wb_addr = cpu_to_be32(phys);
 	skb_tx_timestamp(skb);
 
-	/* Don't wait up for transmitted skbs to be freed.
-	 */
-	skb_orphan(skb);
-
+	/* FIXME: eliminate this mmio access if xmit_more is set */
 	hip04_set_xmit_desc(priv, phys);
 	priv->tx_head = TX_NEXT(tx_head);
+	count++;
 
 	netdev_sent_queue(ndev, skb->len);
 
 	stats->tx_bytes += skb->len;
 	stats->tx_packets++;
-	priv->tx_count++;
-	priv->last_tx = jiffies;
 
-	/* Ensure tx_head & tx_count update visible to tx reclaim */
-	smp_mb();
+	/* Ensure tx_head update visible to tx reclaim */
+	smp_wmb();
+
+	/* queue is getting full, better start cleaning up now */
+	if (count >= priv->tx_coalesce_frames) {
+		if (napi_schedule_prep(&priv->napi)) {
+			/* disable rx interrupt and timer */
+			priv->reg_inten &= ~(RCV_INT);
+			writel_relaxed(DEF_INT_MASK & ~RCV_INT,
+				       priv->base + PPE_INTEN);
+			hrtimer_cancel(&priv->tx_coalesce_timer);
+			__napi_schedule(&priv->napi);
+		}
+	} else if (!hrtimer_is_queued(&priv->tx_coalesce_timer)) {
+		/* cleanup not pending yet, start a new timer */
+		hrtimer_start_expires(&priv->tx_coalesce_timer,
+				      HRTIMER_MODE_REL);
+	}
 
 	return NETDEV_TX_OK;
 }
 
@@ -477,6 +481,7 @@ static int hip04_rx_poll(struct napi_struct *napi, int budget)
 	bool last = false;
 	dma_addr_t phys;
 	int rx = 0;
+	int tx_remaining;
 	u16 len;
 	u32 err;
 
@@ -513,11 +518,11 @@ static int hip04_rx_poll(struct napi_struct *napi, int budget)
 
 		buf = netdev_alloc_frag(priv->rx_buf_size);
 		if (!buf)
-			return -ENOMEM;
+			goto done;
 
 		phys = dma_map_single(&ndev->dev, buf, RX_BUF_SIZE, DMA_FROM_DEVICE);
 		if (dma_mapping_error(&ndev->dev, phys))
-			return -EIO;
+			goto done;
 		priv->rx_buf[priv->rx_head] = buf;
 		priv->rx_phys[priv->rx_head] = phys;
 		hip04_set_recv_desc(priv, phys);
@@ -537,6 +542,11 @@ static int hip04_rx_poll(struct napi_struct *napi, int budget)
 	}
 	napi_complete(napi);
 done:
+	/* clean up tx descriptors and start a new timer if necessary */
+	tx_remaining = hip04_tx_reclaim(ndev, false);
+	if (rx < budget && tx_remaining)
+		hrtimer_start_expires(&priv->tx_coalesce_timer, HRTIMER_MODE_REL);
+
 	return rx;
 }
 
@@ -547,6 +557,9 @@ static irqreturn_t hip04_mac_interrupt(int irq, void *dev_id)
 	struct net_device_stats *stats = &ndev->stats;
 	u32 ists = readl_relaxed(priv->base + PPE_INTSTS);
 
+	if (!ists)
+		return IRQ_NONE;
+
 	writel_relaxed(DEF_INT_MASK, priv->base + PPE_RINT);
 
 	if (unlikely(ists & DEF_INT_ERR)) {
@@ -560,16 +573,32 @@ static irqreturn_t hip04_mac_interrupt(int irq, void *dev_id)
 		}
 	}
 
-	if (ists & RCV_INT) {
+	if (ists & RCV_INT && napi_schedule_prep(&priv->napi)) {
 		/* disable rx interrupt */
 		priv->reg_inten &= ~(RCV_INT);
-		writel_relaxed(priv->reg_inten, priv->base + PPE_INTEN);
-		napi_schedule(&priv->napi);
+		writel_relaxed(DEF_INT_MASK & ~RCV_INT, priv->base + PPE_INTEN);
+		hrtimer_cancel(&priv->tx_coalesce_timer);
+		__napi_schedule(&priv->napi);
 	}
 
 	return IRQ_HANDLED;
 }
 
+enum hrtimer_restart tx_done(struct hrtimer *hrtimer)
+{
+	struct hip04_priv *priv;
+	priv = container_of(hrtimer, struct hip04_priv, tx_coalesce_timer);
+
+	if (napi_schedule_prep(&priv->napi)) {
+		/* disable rx interrupt */
+		priv->reg_inten &= ~(RCV_INT);
+		writel_relaxed(DEF_INT_MASK & ~RCV_INT, priv->base + PPE_INTEN);
+		__napi_schedule(&priv->napi);
+	}
+
+	return HRTIMER_NORESTART;
+}
+
 static void hip04_adjust_link(struct net_device *ndev)
 {
 	struct hip04_priv *priv = netdev_priv(ndev);
@@ -589,7 +618,6 @@ static int hip04_mac_open(struct net_device *ndev)
 	priv->rx_head = 0;
 	priv->tx_head = 0;
 	priv->tx_tail = 0;
-	priv->tx_count = 0;
 	hip04_reset_ppe(priv);
 
 	for (i = 0; i < RX_DESC_NUM; i++) {
@@ -612,9 +640,6 @@ static int hip04_mac_open(struct net_device *ndev)
 	hip04_mac_enable(ndev);
 	napi_enable(&priv->napi);
 
-	INIT_DELAYED_WORK(&priv->tx_clean_task, hip04_tx_clean_monitor);
-	queue_delayed_work(priv->wq, &priv->tx_clean_task, 0);
-
 	return 0;
 }
 
@@ -623,8 +648,6 @@ static int hip04_mac_stop(struct net_device *ndev)
 	struct hip04_priv *priv = netdev_priv(ndev);
 	int i;
 
-	cancel_delayed_work_sync(&priv->tx_clean_task);
-
 	napi_disable(&priv->napi);
 	netif_stop_queue(ndev);
 	hip04_mac_disable(ndev);
@@ -725,6 +748,7 @@ static int hip04_mac_probe(struct platform_device *pdev)
 	struct hip04_priv *priv;
 	struct resource *res;
 	unsigned int irq;
+	ktime_t txtime;
 	int ret;
 
 	ndev = alloc_etherdev(sizeof(struct hip04_priv));
@@ -751,6 +775,21 @@ static int hip04_mac_probe(struct platform_device *pdev)
 	priv->port = arg.args[0];
 	priv->chan = arg.args[1] * RX_DESC_NUM;
 
+	hrtimer_init(&priv->tx_coalesce_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+
+	/*
+	 * BQL will try to keep the TX queue as short as possible, but it can't
+	 * be faster than tx_coalesce_usecs, so we need a fast timeout here,
+	 * but also long enough to gather up enough frames to ensure we don't
+	 * get more interrupts than necessary.
+	 * 200us is enough for 16 frames of 1500 bytes at gigabit ethernet rate
+	 */
+	priv->tx_coalesce_frames = TX_DESC_NUM * 3 / 4;
+	priv->tx_coalesce_usecs = 200;
+	/* allow timer to fire after half the time at the earliest */
+	txtime = ktime_set(0, priv->tx_coalesce_usecs * NSEC_PER_USEC / 2);
+	hrtimer_set_expires_range(&priv->tx_coalesce_timer, txtime, txtime);
+
 	priv->map = syscon_node_to_regmap(arg.np);
 	if (IS_ERR(priv->map)) {
 		dev_warn(d, "no syscon hisilicon,hip04-ppe\n");
@@ -788,12 +827,6 @@ static int hip04_mac_probe(struct platform_device *pdev)
 		}
 	}
 
-	priv->wq = create_singlethread_workqueue(ndev->name);
-	if (!priv->wq) {
-		ret = -ENOMEM;
-		goto init_fail;
-	}
-
 	INIT_WORK(&priv->tx_timeout_task, hip04_tx_timeout_task);
 
 	ether_setup(ndev);
@@ -848,8 +881,6 @@ static int hip04_remove(struct platform_device *pdev)
 	free_irq(ndev->irq, ndev);
 	of_node_put(priv->phy_node);
 	cancel_work_sync(&priv->tx_timeout_task);
-	if (priv->wq)
-		destroy_workqueue(priv->wq);
 	free_netdev(ndev);
 
 	return 0;
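A quick sanity check on the 200us figure in the probe comment: one
1500-byte frame is 12,000 bits, i.e. 12us on the wire at 1Gbit/s, so 16
such frames take 16 * 12us = 192us, which is where the 200us default for
tx_coalesce_usecs comes from. Since txtime is set to half that value and
used as both the start and the length of the expiry range, the coalesce
timer may fire anywhere between 100us and 200us after it is armed.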
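As mentioned in the list above, here is the lockless tx_head/tx_tail
scheme reduced to a standalone userspace model, since the barrier pairing
is the part that is easiest to get wrong. This is an illustrative sketch
only, not driver code: the names (spsc_ring, ring_produce, ring_consume)
are invented for the example, C11 acquire/release atomics stand in for
ACCESS_ONCE plus smp_rmb()/smp_wmb(), and it uses free-running indexes on
a power-of-two ring instead of the wrapped indexes and the modulo in
tx_count() above.

/* spsc_ring.c: single-producer/single-consumer ring, modeled on the tx
 * path in the patch. xmit is the only writer of head, the napi
 * poll/reclaim path is the only writer of tail, so no lock is needed as
 * long as each side orders its slot accesses against the index update.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

#define RING_SIZE 256u /* power of two, standing in for TX_DESC_NUM */

struct spsc_ring {
	void *slot[RING_SIZE];
	_Atomic unsigned int head; /* written by the producer only */
	_Atomic unsigned int tail; /* written by the consumer only */
};

/* The indexes run freely and wrap at UINT_MAX; because RING_SIZE divides
 * 2^32, head - tail is always the number of entries in flight.
 */
static unsigned int ring_count(unsigned int head, unsigned int tail)
{
	return head - tail;
}

static bool ring_produce(struct spsc_ring *r, void *data)
{
	unsigned int head = atomic_load_explicit(&r->head, memory_order_relaxed);
	unsigned int tail = atomic_load_explicit(&r->tail, memory_order_acquire);

	if (ring_count(head, tail) == RING_SIZE - 1)
		return false; /* full: this is where xmit stops the queue */

	r->slot[head % RING_SIZE] = data;
	/* Release pairs with the consumer's acquire: the slot contents
	 * become visible before the new head does, which is what the
	 * smp_wmb() after the tx_head update guarantees in the patch.
	 */
	atomic_store_explicit(&r->head, head + 1, memory_order_release);
	return true;
}

static void *ring_consume(struct spsc_ring *r)
{
	unsigned int tail = atomic_load_explicit(&r->tail, memory_order_relaxed);
	unsigned int head = atomic_load_explicit(&r->head, memory_order_acquire);
	void *data;

	if (ring_count(head, tail) == 0)
		return NULL; /* nothing to reclaim */

	data = r->slot[tail % RING_SIZE];
	/* Publish the freed slot before the new tail, mirroring the
	 * smp_wmb() after the tx_tail update in hip04_tx_reclaim().
	 */
	atomic_store_explicit(&r->tail, tail + 1, memory_order_release);
	return data;
}

int main(void)
{
	static struct spsc_ring ring;
	int pkt = 42;

	ring_produce(&ring, &pkt);
	return ring_consume(&ring) == &pkt ? 0 : 1;
}

The point is the same as in the patch: each index has exactly one writer,
so plain stores plus the right ordering are sufficient, and no spinlock is
needed between the xmit and reclaim paths.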