On Mon, 12 Mar 2012 14:44:27 +0200 <horia.geanta@xxxxxxxxxxxxx> wrote: > This patch replaces the back-half implementation of talitos crypto engine from > tasklet to NAPI. The decision to do this was based on improved performance > (around 7%). > A similiar patch (not posted yet) was tested for caam crypto engine, with > 10-15% improvement over tasklet. > > Since having crypto engines use the net softirq is probably not acceptable, > I would like to hear your comments on what options do I have to make this > upstreamable. > Besides current approach, I am considering the following: > - defining a new softirq for crypto engines, having a higher priority than the > NET_RX_SOFTIRQ but the priority just has to be equal to NET_RX_SOFTIRQ to get the net--crypto performance balance, not necessarily higher IIRC. > - using tasklet_hi_schedule instead of tasklet_schedule this has done close to nothing for performance in my experience - but there is no other tasklet competing for the slice in my IPSec fwding tests either. > Let me know if any of these two fits better or if something else is preferred. Herbert/Dave, Is it ok for a crypto driver to depend on NET? If not, how should the NAPI-style flow be abstracted out of NET? Thanks, Kim > Thank you > > From 20f30ef6fdfe641f1c30f94320891715ffee33a2 Mon Sep 17 00:00:00 2001 > From: Sandeep Malik <Sandeep.Malik@xxxxxxxxxxxxx> > Date: Sat, 12 Jun 2010 14:08:47 +0800 > Subject: [RFC,PATCH] crypto: talitos - Replace the tasklet implementation with NAPI > > This patch updates the current tasklet implement to NAPI so as > the system is more balanced in the terms that the packet submission > and the packet forwarding after being processed can be done at > the same priority. > > Signed-off-by: Sandeep Malik <Sandeep.Malik@xxxxxxxxxxxxx> > Signed-off-by: Horia Geanta <horia.geanta@xxxxxxxxxxxxx> > --- > drivers/crypto/Kconfig | 2 +- > drivers/crypto/talitos.c | 145 +++++++++++++++++++++++++++++++++------------- > drivers/crypto/talitos.h | 4 +- > 3 files changed, 109 insertions(+), 42 deletions(-) > > diff --git a/drivers/crypto/Kconfig b/drivers/crypto/Kconfig > index e707979..682096b 100644 > --- a/drivers/crypto/Kconfig > +++ b/drivers/crypto/Kconfig > @@ -218,7 +218,7 @@ config CRYPTO_DEV_TALITOS > select CRYPTO_ALGAPI > select CRYPTO_AUTHENC > select HW_RANDOM > - depends on FSL_SOC > + depends on FSL_SOC && NET > help > Say 'Y' here to use the Freescale Security Engine (SEC) > to offload cryptographic algorithm computation. > diff --git a/drivers/crypto/talitos.c b/drivers/crypto/talitos.c > index dc641c7..f368579 100644 > --- a/drivers/crypto/talitos.c > +++ b/drivers/crypto/talitos.c > @@ -1,7 +1,7 @@ > /* > * talitos - Freescale Integrated Security Engine (SEC) device driver > * > - * Copyright (c) 2008-2011 Freescale Semiconductor, Inc. > + * Copyright (c) 2008-2012 Freescale Semiconductor, Inc. > * > * Scatterlist Crypto API glue code copied from files with the following: > * Copyright (c) 2006-2007 Herbert Xu <herbert@xxxxxxxxxxxxxxxxxxx> > @@ -37,6 +37,7 @@ > #include <linux/io.h> > #include <linux/spinlock.h> > #include <linux/rtnetlink.h> > +#include <linux/netdevice.h> > #include <linux/slab.h> > > #include <crypto/algapi.h> > @@ -121,6 +122,7 @@ struct talitos_channel { > struct talitos_private { > struct device *dev; > struct platform_device *ofdev; > + struct net_device __percpu *netdev; > void __iomem *reg; > int irq[2]; > > @@ -145,8 +147,8 @@ struct talitos_private { > /* next channel to be assigned next incoming descriptor */ > atomic_t last_chan ____cacheline_aligned; > > - /* request callback tasklet */ > - struct tasklet_struct done_task[2]; > + /* request callback napi */ > + struct napi_struct __percpu *done_task[2]; > > /* list of registered algorithms */ > struct list_head alg_list; > @@ -349,17 +351,18 @@ static int talitos_submit(struct device *dev, int ch, struct talitos_desc *desc, > /* > * process what was done, notify callback of error if not > */ > -static void flush_channel(struct device *dev, int ch, int error, int reset_ch) > +static int flush_channel(struct device *dev, int ch, int error, int reset_ch, > + int weight) > { > struct talitos_private *priv = dev_get_drvdata(dev); > struct talitos_request *request, saved_req; > unsigned long flags; > - int tail, status; > + int tail, status, count = 0; > > spin_lock_irqsave(&priv->chan[ch].tail_lock, flags); > > tail = priv->chan[ch].tail; > - while (priv->chan[ch].fifo[tail].desc) { > + while (priv->chan[ch].fifo[tail].desc && (count < weight)) { > request = &priv->chan[ch].fifo[tail]; > > /* descriptors with their done bits set don't get the error */ > @@ -396,43 +399,55 @@ static void flush_channel(struct device *dev, int ch, int error, int reset_ch) > status); > /* channel may resume processing in single desc error case */ > if (error && !reset_ch && status == error) > - return; > + return 0; > spin_lock_irqsave(&priv->chan[ch].tail_lock, flags); > tail = priv->chan[ch].tail; > + count++; > } > > spin_unlock_irqrestore(&priv->chan[ch].tail_lock, flags); > + > + return count; > } > > /* > * process completed requests for channels that have done status > */ > -#define DEF_TALITOS_DONE(name, ch_done_mask) \ > -static void talitos_done_##name(unsigned long data) \ > +#define DEF_TALITOS_DONE(name, ch_done_mask, num_ch) \ > +static int talitos_done_##name(struct napi_struct *napi, int budget) \ > { \ > - struct device *dev = (struct device *)data; \ > + struct device *dev = &napi->dev->dev; \ > struct talitos_private *priv = dev_get_drvdata(dev); \ > + int budget_per_ch, work_done = 0; \ > \ > + budget_per_ch = budget / num_ch; \ > if (ch_done_mask & 1) \ > - flush_channel(dev, 0, 0, 0); \ > + work_done += flush_channel(dev, 0, 0, 0, budget_per_ch);\ > if (priv->num_channels == 1) \ > goto out; \ > if (ch_done_mask & (1 << 2)) \ > - flush_channel(dev, 1, 0, 0); \ > + work_done += flush_channel(dev, 1, 0, 0, budget_per_ch);\ > if (ch_done_mask & (1 << 4)) \ > - flush_channel(dev, 2, 0, 0); \ > + work_done += flush_channel(dev, 2, 0, 0, budget_per_ch);\ > if (ch_done_mask & (1 << 6)) \ > - flush_channel(dev, 3, 0, 0); \ > + work_done += flush_channel(dev, 3, 0, 0, budget_per_ch);\ > \ > out: \ > - /* At this point, all completed channels have been processed */ \ > - /* Unmask done interrupts for channels completed later on. */ \ > - setbits32(priv->reg + TALITOS_IMR, ch_done_mask); \ > - setbits32(priv->reg + TALITOS_IMR_LO, TALITOS_IMR_LO_INIT); \ > + if (work_done < budget) { \ > + napi_complete(napi); \ > + /* At this point, all completed channels have been */ \ > + /* processed. Unmask done interrupts for channels */ \ > + /* completed later on. */ \ > + setbits32(priv->reg + TALITOS_IMR, ch_done_mask); \ > + setbits32(priv->reg + TALITOS_IMR_LO, \ > + TALITOS_IMR_LO_INIT); \ > + } \ > + \ > + return work_done; \ > } > -DEF_TALITOS_DONE(4ch, TALITOS_ISR_4CHDONE) > -DEF_TALITOS_DONE(ch0_2, TALITOS_ISR_CH_0_2_DONE) > -DEF_TALITOS_DONE(ch1_3, TALITOS_ISR_CH_1_3_DONE) > +DEF_TALITOS_DONE(4ch, TALITOS_ISR_4CHDONE, 4) > +DEF_TALITOS_DONE(ch0_2, TALITOS_ISR_CH_0_2_DONE, 2) > +DEF_TALITOS_DONE(ch1_3, TALITOS_ISR_CH_1_3_DONE, 2) > > /* > * locate current (offending) descriptor > @@ -582,7 +597,7 @@ static void talitos_error(struct device *dev, u32 isr, u32 isr_lo) > if (v_lo & TALITOS_CCPSR_LO_SRL) > dev_err(dev, "scatter return/length error\n"); > > - flush_channel(dev, ch, error, reset_ch); > + flush_channel(dev, ch, error, reset_ch, priv->fifo_len); > > if (reset_ch) { > reset_channel(dev, ch); > @@ -606,14 +621,14 @@ static void talitos_error(struct device *dev, u32 isr, u32 isr_lo) > > /* purge request queues */ > for (ch = 0; ch < priv->num_channels; ch++) > - flush_channel(dev, ch, -EIO, 1); > + flush_channel(dev, ch, -EIO, 1, priv->fifo_len); > > /* reset and reinitialize the device */ > init_device(dev); > } > } > > -#define DEF_TALITOS_INTERRUPT(name, ch_done_mask, ch_err_mask, tlet) \ > +#define DEF_TALITOS_INTERRUPT(name, ch_done_mask, ch_err_mask, sirq) \ > static irqreturn_t talitos_interrupt_##name(int irq, void *data) \ > { \ > struct device *dev = data; \ > @@ -633,7 +648,8 @@ static irqreturn_t talitos_interrupt_##name(int irq, void *data) \ > /* mask further done interrupts. */ \ > clrbits32(priv->reg + TALITOS_IMR, ch_done_mask); \ > /* done_task will unmask done interrupts at exit */ \ > - tasklet_schedule(&priv->done_task[tlet]); \ > + napi_schedule(per_cpu_ptr(priv->done_task[sirq], \ > + smp_processor_id())); \ > } \ > \ > return (isr & (ch_done_mask | ch_err_mask) || isr_lo) ? IRQ_HANDLED : \ > @@ -2555,7 +2571,7 @@ static int talitos_remove(struct platform_device *ofdev) > struct device *dev = &ofdev->dev; > struct talitos_private *priv = dev_get_drvdata(dev); > struct talitos_crypto_alg *t_alg, *n; > - int i; > + int i, j; > > list_for_each_entry_safe(t_alg, n, &priv->alg_list, entry) { > switch (t_alg->algt.type) { > @@ -2574,25 +2590,32 @@ static int talitos_remove(struct platform_device *ofdev) > if (hw_supports(dev, DESC_HDR_SEL0_RNG)) > talitos_unregister_rng(dev); > > - for (i = 0; i < priv->num_channels; i++) > - kfree(priv->chan[i].fifo); > - > - kfree(priv->chan); > - > for (i = 0; i < 2; i++) > if (priv->irq[i]) { > free_irq(priv->irq[i], dev); > irq_dispose_mapping(priv->irq[i]); > + > + for_each_possible_cpu(j) { > + napi_disable(per_cpu_ptr(priv->done_task[i], > + j)); > + netif_napi_del(per_cpu_ptr(priv->done_task[i], > + j)); > + } > + > + free_percpu(priv->done_task[i]); > } > > - tasklet_kill(&priv->done_task[0]); > - if (priv->irq[1]) > - tasklet_kill(&priv->done_task[1]); > + for (i = 0; i < priv->num_channels; i++) > + kfree(priv->chan[i].fifo); > + > + kfree(priv->chan); > > iounmap(priv->reg); > > dev_set_drvdata(dev, NULL); > > + free_percpu(priv->netdev); > + > kfree(priv); > > return 0; > @@ -2718,19 +2741,61 @@ static int talitos_probe(struct platform_device *ofdev) > dev_set_drvdata(dev, priv); > > priv->ofdev = ofdev; > + priv->dev = dev; > + > + priv->netdev = alloc_percpu(struct net_device); > + if (!priv->netdev) { > + dev_err(dev, "failed to allocate netdevice\n"); > + err = -ENOMEM; > + goto err_out; > + } > + > + for_each_possible_cpu(i) { > + err = init_dummy_netdev(per_cpu_ptr(priv->netdev, i)); > + if (err) { > + dev_err(dev, "failed to initialize dummy netdevice\n"); > + goto err_out; > + } > + (per_cpu_ptr(priv->netdev, i))->dev = *dev; > + } > > err = talitos_probe_irq(ofdev); > if (err) > goto err_out; > > + priv->done_task[0] = alloc_percpu(struct napi_struct); > + if (!priv->done_task[0]) { > + dev_err(dev, "failed to allocate napi for 1st irq\n"); > + err = -ENOMEM; > + goto err_out; > + } > + > if (!priv->irq[1]) { > - tasklet_init(&priv->done_task[0], talitos_done_4ch, > - (unsigned long)dev); > + for_each_possible_cpu(i) { > + netif_napi_add(per_cpu_ptr(priv->netdev, i), > + per_cpu_ptr(priv->done_task[0], i), > + talitos_done_4ch, TALITOS_NAPI_WEIGHT); > + napi_enable(per_cpu_ptr(priv->done_task[0], i)); > + } > } else { > - tasklet_init(&priv->done_task[0], talitos_done_ch0_2, > - (unsigned long)dev); > - tasklet_init(&priv->done_task[1], talitos_done_ch1_3, > - (unsigned long)dev); > + priv->done_task[1] = alloc_percpu(struct napi_struct); > + if (!priv->done_task[1]) { > + dev_err(dev, "failed to allocate napi for 2nd irq\n"); > + err = -ENOMEM; > + goto err_out; > + } > + > + for_each_possible_cpu(i) { > + netif_napi_add(per_cpu_ptr(priv->netdev, i), > + per_cpu_ptr(priv->done_task[0], i), > + talitos_done_ch0_2, TALITOS_NAPI_WEIGHT); > + napi_enable(per_cpu_ptr(priv->done_task[0], i)); > + > + netif_napi_add(per_cpu_ptr(priv->netdev, i), > + per_cpu_ptr(priv->done_task[1], i), > + talitos_done_ch1_3, TALITOS_NAPI_WEIGHT); > + napi_enable(per_cpu_ptr(priv->done_task[1], i)); > + } > } > > INIT_LIST_HEAD(&priv->alg_list); > diff --git a/drivers/crypto/talitos.h b/drivers/crypto/talitos.h > index 3c17395..ba62abc 100644 > --- a/drivers/crypto/talitos.h > +++ b/drivers/crypto/talitos.h > @@ -1,7 +1,7 @@ > /* > * Freescale SEC (talitos) device register and descriptor header defines > * > - * Copyright (c) 2006-2011 Freescale Semiconductor, Inc. > + * Copyright (c) 2006-2012 Freescale Semiconductor, Inc. > * > * Redistribution and use in source and binary forms, with or without > * modification, are permitted provided that the following conditions > @@ -28,6 +28,8 @@ > * > */ > > +#define TALITOS_NAPI_WEIGHT 12 > + > /* > * TALITOS_xxx_LO addresses point to the low data bits (32-63) of the register > */ > -- > 1.7.3.4 > -- To unsubscribe from this list: send the line "unsubscribe linux-crypto" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html