Re: [PATCH net-next v3 4/4] net: ocelot: add FDMA support

Vladimir Oltean <vladimir.oltean@xxxxxxx> · Sat, 27 Nov 2021 14:58:06 +0000

On Fri, Nov 26, 2021 at 06:27:39PM +0100, Clément Léger wrote:
> Ethernet frames can be extracted or injected autonomously to or from
> the device’s DDR3/DDR3L memory and/or PCIe memory space. Linked list
> data structures in memory are used for injecting or extracting Ethernet
> frames. The FDMA generates interrupts when frame extraction or
> injection is done and when the linked lists need updating.
> 
> The FDMA is shared between all the ethernet ports of the switch and
> uses a linked list of descriptors (DCB) to inject and extract packets.
> Before adding descriptors, the FDMA channels must be stopped. It would
> be inefficient to do that each time a descriptor would be added so the
> channels are restarted only once they stopped.
> 
> Both channels uses ring-like structure to feed the DCBs to the FDMA.
> head and tail are never touched by hardware and are completely handled
> by the driver.
> 
> Co-developed-by: Alexandre Belloni <alexandre.belloni@xxxxxxxxxxx>
> Signed-off-by: Alexandre Belloni <alexandre.belloni@xxxxxxxxxxx>
> Signed-off-by: Clément Léger <clement.leger@xxxxxxxxxxx>
> ---

I need to ask, was there any change in performance, in one direction or
the other, between the ring and list based implementations?

>  drivers/net/ethernet/mscc/Makefile         |   1 +
>  drivers/net/ethernet/mscc/ocelot.c         |  43 +-
>  drivers/net/ethernet/mscc/ocelot.h         |   1 +
>  drivers/net/ethernet/mscc/ocelot_fdma.c    | 713 +++++++++++++++++++++
>  drivers/net/ethernet/mscc/ocelot_fdma.h    |  96 +++
>  drivers/net/ethernet/mscc/ocelot_net.c     |  18 +-
>  drivers/net/ethernet/mscc/ocelot_vsc7514.c |  13 +
>  include/soc/mscc/ocelot.h                  |   4 +
>  8 files changed, 869 insertions(+), 20 deletions(-)
>  create mode 100644 drivers/net/ethernet/mscc/ocelot_fdma.c
>  create mode 100644 drivers/net/ethernet/mscc/ocelot_fdma.h
> 
> diff --git a/drivers/net/ethernet/mscc/Makefile b/drivers/net/ethernet/mscc/Makefile
> index 722c27694b21..d76a9b78b6ca 100644
> --- a/drivers/net/ethernet/mscc/Makefile
> +++ b/drivers/net/ethernet/mscc/Makefile
> @@ -11,5 +11,6 @@ mscc_ocelot_switch_lib-y := \
>  mscc_ocelot_switch_lib-$(CONFIG_BRIDGE_MRP) += ocelot_mrp.o
>  obj-$(CONFIG_MSCC_OCELOT_SWITCH) += mscc_ocelot.o
>  mscc_ocelot-y := \
> +	ocelot_fdma.o \
>  	ocelot_vsc7514.o \
>  	ocelot_net.o
> diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c
> index 1f7c9ff18ac5..4b2460d232c2 100644
> --- a/drivers/net/ethernet/mscc/ocelot.c
> +++ b/drivers/net/ethernet/mscc/ocelot.c
> @@ -966,14 +966,37 @@ static int ocelot_xtr_poll_xfh(struct ocelot *ocelot, int grp, u32 *xfh)
>  	return 0;
>  }
>  
> -int ocelot_xtr_poll_frame(struct ocelot *ocelot, int grp, struct sk_buff **nskb)
> +void ocelot_ptp_rx_timestamp(struct ocelot *ocelot, struct sk_buff *skb,
> +			     u64 timestamp)
>  {
>  	struct skb_shared_hwtstamps *shhwtstamps;
>  	u64 tod_in_ns, full_ts_in_ns;
> +	struct timespec64 ts;
> +
> +	if (!ocelot->ptp)
> +		return;
> +
> +	ocelot_ptp_gettime64(&ocelot->ptp_info, &ts);
> +
> +	tod_in_ns = ktime_set(ts.tv_sec, ts.tv_nsec);
> +	if ((tod_in_ns & 0xffffffff) < timestamp)
> +		full_ts_in_ns = (((tod_in_ns >> 32) - 1) << 32) |
> +				timestamp;
> +	else
> +		full_ts_in_ns = (tod_in_ns & GENMASK_ULL(63, 32)) |
> +				timestamp;
> +
> +	shhwtstamps = skb_hwtstamps(skb);
> +	memset(shhwtstamps, 0, sizeof(struct skb_shared_hwtstamps));
> +	shhwtstamps->hwtstamp = full_ts_in_ns;
> +}
> +EXPORT_SYMBOL(ocelot_ptp_rx_timestamp);

This split can very well be a separate patch, it's distracting.

> +
> +int ocelot_xtr_poll_frame(struct ocelot *ocelot, int grp, struct sk_buff **nskb)
> +{
>  	u64 timestamp, src_port, len;
>  	u32 xfh[OCELOT_TAG_LEN / 4];
>  	struct net_device *dev;
> -	struct timespec64 ts;
>  	struct sk_buff *skb;
>  	int sz, buf_len;
>  	u32 val, *buf;
> @@ -1029,21 +1052,7 @@ int ocelot_xtr_poll_frame(struct ocelot *ocelot, int grp, struct sk_buff **nskb)
>  		*buf = val;
>  	}
>  
> -	if (ocelot->ptp) {
> -		ocelot_ptp_gettime64(&ocelot->ptp_info, &ts);
> -
> -		tod_in_ns = ktime_set(ts.tv_sec, ts.tv_nsec);
> -		if ((tod_in_ns & 0xffffffff) < timestamp)
> -			full_ts_in_ns = (((tod_in_ns >> 32) - 1) << 32) |
> -					timestamp;
> -		else
> -			full_ts_in_ns = (tod_in_ns & GENMASK_ULL(63, 32)) |
> -					timestamp;
> -
> -		shhwtstamps = skb_hwtstamps(skb);
> -		memset(shhwtstamps, 0, sizeof(struct skb_shared_hwtstamps));
> -		shhwtstamps->hwtstamp = full_ts_in_ns;
> -	}
> +	ocelot_ptp_rx_timestamp(ocelot, skb, timestamp);
>  
>  	/* Everything we see on an interface that is in the HW bridge
>  	 * has already been forwarded.
> diff --git a/drivers/net/ethernet/mscc/ocelot.h b/drivers/net/ethernet/mscc/ocelot.h
> index e43da09b8f91..f1a7b403e221 100644
> --- a/drivers/net/ethernet/mscc/ocelot.h
> +++ b/drivers/net/ethernet/mscc/ocelot.h
> @@ -9,6 +9,7 @@
>  #define _MSCC_OCELOT_H_
>  
>  #include <linux/bitops.h>
> +#include <linux/dsa/ocelot.h>
>  #include <linux/etherdevice.h>
>  #include <linux/if_vlan.h>
>  #include <linux/net_tstamp.h>
> diff --git a/drivers/net/ethernet/mscc/ocelot_fdma.c b/drivers/net/ethernet/mscc/ocelot_fdma.c
> new file mode 100644
> index 000000000000..e42c2c3ad273
> --- /dev/null
> +++ b/drivers/net/ethernet/mscc/ocelot_fdma.c
> @@ -0,0 +1,713 @@
> +// SPDX-License-Identifier: (GPL-2.0 OR MIT)
> +/*
> + * Microsemi SoCs FDMA driver
> + *
> + * Copyright (c) 2021 Microchip
> + */
> +
> +#include <linux/bitops.h>
> +#include <linux/dmapool.h>
> +#include <linux/dsa/ocelot.h>
> +#include <linux/netdevice.h>
> +#include <linux/of_platform.h>
> +#include <linux/skbuff.h>
> +
> +#include "ocelot_fdma.h"
> +#include "ocelot_qs.h"
> +
> +#define MSCC_FDMA_DCB_LLP(x)			((x) * 4 + 0x0)
> +#define MSCC_FDMA_DCB_LLP_PREV(x)		((x) * 4 + 0xA0)
> +
> +#define MSCC_FDMA_DCB_STAT_BLOCKO(x)		(((x) << 20) & GENMASK(31, 20))
> +#define MSCC_FDMA_DCB_STAT_BLOCKO_M		GENMASK(31, 20)
> +#define MSCC_FDMA_DCB_STAT_BLOCKO_X(x)		(((x) & GENMASK(31, 20)) >> 20)
> +#define MSCC_FDMA_DCB_STAT_PD			BIT(19)
> +#define MSCC_FDMA_DCB_STAT_ABORT		BIT(18)
> +#define MSCC_FDMA_DCB_STAT_EOF			BIT(17)
> +#define MSCC_FDMA_DCB_STAT_SOF			BIT(16)
> +#define MSCC_FDMA_DCB_STAT_BLOCKL_M		GENMASK(15, 0)
> +#define MSCC_FDMA_DCB_STAT_BLOCKL(x)		((x) & GENMASK(15, 0))
> +
> +#define MSCC_FDMA_CH_SAFE			0xcc
> +
> +#define MSCC_FDMA_CH_ACTIVATE			0xd0
> +
> +#define MSCC_FDMA_CH_DISABLE			0xd4
> +
> +#define MSCC_FDMA_EVT_ERR			0x164
> +
> +#define MSCC_FDMA_EVT_ERR_CODE			0x168
> +
> +#define MSCC_FDMA_INTR_LLP			0x16c
> +
> +#define MSCC_FDMA_INTR_LLP_ENA			0x170
> +
> +#define MSCC_FDMA_INTR_FRM			0x174
> +
> +#define MSCC_FDMA_INTR_FRM_ENA			0x178
> +
> +#define MSCC_FDMA_INTR_ENA			0x184
> +
> +#define MSCC_FDMA_INTR_IDENT			0x188
> +
> +#define MSCC_FDMA_INJ_CHAN			2
> +#define MSCC_FDMA_XTR_CHAN			0
> +
> +#define OCELOT_FDMA_RX_MTU			ETH_DATA_LEN
> +#define OCELOT_FDMA_WEIGHT			32

I guess you've reduced to half of NAPI_POLL_WEIGHT because the NET_RX
softirq is consuming too much CPU time with the default value? I don't
know if this is the productive thing to do with a very slow CPU that is
swamped with traffic, since you're practically leaving yourself exposed
to more interrupts, can somebody please chime in?

> +#define OCELOT_FDMA_RX_REFILL_COUNT		(OCELOT_FDMA_MAX_DCB / 2)

Unused. I suppose you wanted to refill more than once per NAPI poll
cycle (as you currently do in ocelot_fdma_rx_restart) but you didn't get
around to it? I think you should still do that, don't leave the RX ring
running dry.

> +
> +#define OCELOT_FDMA_CH_SAFE_TIMEOUT_MS		100
> +
> +#define OCELOT_FDMA_RX_EXTRA_SIZE \
> +				(OCELOT_TAG_LEN + ETH_FCS_LEN + ETH_HLEN)

Could all these macros belong to ocelot_fdma.h?

> +
> +static int ocelot_fdma_rx_buf_size(int mtu)
> +{
> +	return ALIGN(mtu + OCELOT_FDMA_RX_EXTRA_SIZE, 4);
> +}
> +
> +static void ocelot_fdma_writel(struct ocelot_fdma *fdma, u32 reg, u32 data)
> +{
> +	writel(data, fdma->base + reg);
> +}

Regmap is too slow for you, you're using direct I/O accessors now?

> +
> +static u32 ocelot_fdma_readl(struct ocelot_fdma *fdma, u32 reg)
> +{
> +	return readl(fdma->base + reg);
> +}
> +
> +static unsigned int ocelot_fdma_idx_incr(unsigned int idx)

Minor comment, but "inc" and "dec" are much more popular abbreviations.
Although the way in which you use them is not quite the same way in
which other drivers use them (something called "inc" would take a
reference on the number and actually increment it). So maybe "next" and
"prev"?

> +{
> +	idx++;
> +	if (idx == OCELOT_FDMA_MAX_DCB)
> +		idx = 0;
> +
> +	return idx;
> +}
> +
> +static unsigned int ocelot_fdma_idx_decr(unsigned int idx)
> +{
> +	if (idx == 0)
> +		idx = OCELOT_FDMA_MAX_DCB - 1;
> +	else
> +		idx--;
> +
> +	return idx;
> +}
> +
> +static int ocelot_fdma_tx_free_count(struct ocelot_fdma *fdma)
> +{
> +	struct ocelot_fdma_ring *ring = &fdma->inj;
> +
> +	if (ring->tail >= ring->head)
> +		return OCELOT_FDMA_MAX_DCB - (ring->tail - ring->head) - 1;
> +	else
> +		return ring->head - ring->tail - 1;
> +}
> +
> +static bool ocelot_fdma_ring_empty(struct ocelot_fdma_ring *ring)
> +{
> +	return ring->head == ring->tail;
> +}
> +
> +static void ocelot_fdma_activate_chan(struct ocelot_fdma *fdma,
> +				      struct ocelot_fdma_dcb *dcb, int chan)
> +{
> +	ocelot_fdma_writel(fdma, MSCC_FDMA_DCB_LLP(chan), dcb->hw_dma);
> +	ocelot_fdma_writel(fdma, MSCC_FDMA_CH_ACTIVATE, BIT(chan));
> +}
> +
> +static int ocelot_fdma_wait_chan_safe(struct ocelot_fdma *fdma, int chan)
> +{
> +	unsigned long timeout;
> +	u32 safe;
> +
> +	timeout = jiffies + msecs_to_jiffies(OCELOT_FDMA_CH_SAFE_TIMEOUT_MS);
> +	do {
> +		safe = ocelot_fdma_readl(fdma, MSCC_FDMA_CH_SAFE);
> +		if (safe & BIT(chan))
> +			return 0;

Pretty busy loop, and your timeout is 100 ms. Kinda nasty stuff for the
latency of your system.

> +	} while (time_after(jiffies, timeout));
> +
> +	return -ETIMEDOUT;
> +}
> +
> +static int ocelot_fdma_stop_channel(struct ocelot_fdma *fdma, int chan)
> +{
> +	ocelot_fdma_writel(fdma, MSCC_FDMA_CH_DISABLE, BIT(chan));
> +
> +	return ocelot_fdma_wait_chan_safe(fdma, chan);

Maybe for the extraction channel it would make sense to have an async
stop, meaning that you ask it to stop, then process the frames you've
got so far, and wait until the channel has completely stopped only when
you need to refill?

> +}
> +
> +static bool ocelot_fdma_dcb_set_data(struct ocelot_fdma *fdma,
> +				     struct ocelot_fdma_dcb *dcb,
> +				     struct sk_buff *skb,
> +				     size_t size, enum dma_data_direction dir)
> +{
> +	struct ocelot_fdma_dcb_hw_v2 *hw = dcb->hw;
> +	u32 offset;
> +
> +	dcb->skb = skb;
> +	dcb->mapped_size = size;
> +	dcb->mapping = dma_map_single(fdma->dev, skb->data, size, dir);
> +	if (unlikely(dma_mapping_error(fdma->dev, dcb->mapping)))
> +		return false;
> +
> +	offset = dcb->mapping & 0x3;
> +
> +	hw->llp = 0;
> +	hw->datap = ALIGN_DOWN(dcb->mapping, 4);
> +	hw->datal = ALIGN_DOWN(size, 4);
> +	hw->stat = MSCC_FDMA_DCB_STAT_BLOCKO(offset);
> +
> +	return true;
> +}
> +
> +static bool ocelot_fdma_rx_set_skb(struct ocelot_fdma *fdma,
> +				   struct ocelot_fdma_dcb *dcb,
> +				   struct sk_buff *skb, size_t size)
> +{
> +	return ocelot_fdma_dcb_set_data(fdma, dcb, skb, size,
> +					DMA_FROM_DEVICE);
> +}
> +
> +static bool ocelot_fdma_tx_dcb_set_skb(struct ocelot_fdma *fdma,
> +				       struct ocelot_fdma_dcb *dcb,
> +				       struct sk_buff *skb)
> +{
> +	if (!ocelot_fdma_dcb_set_data(fdma, dcb, skb, skb->len,
> +				      DMA_TO_DEVICE))
> +		return false;
> +
> +	dcb->hw->stat |= MSCC_FDMA_DCB_STAT_BLOCKL(skb->len);
> +	dcb->hw->stat |= MSCC_FDMA_DCB_STAT_SOF | MSCC_FDMA_DCB_STAT_EOF;
> +
> +	return true;
> +}
> +
> +static void ocelot_fdma_rx_restart(struct ocelot_fdma *fdma)
> +{
> +	struct ocelot_fdma_ring *ring = &fdma->xtr;
> +	struct ocelot_fdma_dcb *dcb, *last_dcb;
> +	unsigned int idx;
> +	int ret;
> +	u32 llp;
> +
> +	/* Check if the FDMA hits the DCB with LLP == NULL */
> +	llp = ocelot_fdma_readl(fdma, MSCC_FDMA_DCB_LLP(MSCC_FDMA_XTR_CHAN));
> +	if (llp)
> +		return;

I'm not sure why you're letting the hardware grind to a halt first,
before refilling? I think since the CPU is the bottleneck anyway, you
can stop the extraction channel at any time you want to refill.
A constant stream of less data might be better than a bursty one.
Or maybe I'm misunderstanding some of the details of the hardware.

> +
> +	ret = ocelot_fdma_stop_channel(fdma, MSCC_FDMA_XTR_CHAN);
> +	if (ret) {
> +		dev_warn(fdma->dev, "Unable to stop RX channel\n");

Rate limit these prints maybe.

> +		return;
> +	}
> +
> +	/* Chain the tail with the next DCB */
> +	dcb = &ring->dcbs[ring->tail];
> +	idx = ocelot_fdma_idx_incr(ring->tail);
> +	dcb->hw->llp = ring->dcbs[idx].hw_dma;
> +	dcb = &ring->dcbs[idx];
> +
> +	/* Place a NULL terminator in last DCB added (head - 1) */
> +	idx = ocelot_fdma_idx_decr(ring->head);
> +	last_dcb = &ring->dcbs[idx];
> +	last_dcb->hw->llp = 0;
> +	ring->tail = idx;
> +
> +	/* Finally reactivate the channel */
> +	ocelot_fdma_activate_chan(fdma, dcb, MSCC_FDMA_XTR_CHAN);
> +}
> +
> +static bool ocelot_fdma_rx_get(struct ocelot_fdma *fdma, int budget)
> +{
> +	struct ocelot_fdma_ring *ring = &fdma->xtr;
> +	struct ocelot_fdma_dcb *dcb, *next_dcb;
> +	struct ocelot *ocelot = fdma->ocelot;
> +	struct net_device *ndev;
> +	struct sk_buff *skb;
> +	bool valid = true;
> +	u64 timestamp;
> +	u64 src_port;
> +	void *xfh;
> +	u32 stat;
> +
> +	/* We should not go past the tail */
> +	if (ring->head == ring->tail)
> +		return false;
> +
> +	dcb = &ring->dcbs[ring->head];
> +	stat = dcb->hw->stat;
> +	if (MSCC_FDMA_DCB_STAT_BLOCKL(stat) == 0)
> +		return false;
> +
> +	ring->head = ocelot_fdma_idx_incr(ring->head);
> +
> +	if (stat & MSCC_FDMA_DCB_STAT_ABORT || stat & MSCC_FDMA_DCB_STAT_PD)
> +		valid = false;
> +
> +	if (!(stat & MSCC_FDMA_DCB_STAT_SOF) ||
> +	    !(stat & MSCC_FDMA_DCB_STAT_EOF))
> +		valid = false;
> +
> +	dma_unmap_single(fdma->dev, dcb->mapping, dcb->mapped_size,
> +			 DMA_FROM_DEVICE);
> +
> +	skb = dcb->skb;
> +
> +	if (unlikely(!valid)) {
> +		dev_warn(fdma->dev, "Invalid packet\n");

Rate limit please, and try to show something which might be relevant to
why it is not valid.

> +		goto refill;
> +	}
> +
> +	xfh = skb->data;
> +	ocelot_xfh_get_src_port(xfh, &src_port);
> +
> +	if (WARN_ON(src_port >= ocelot->num_phys_ports))
> +		goto refill;
> +
> +	ndev = ocelot_port_to_netdev(ocelot, src_port);
> +	if (unlikely(!ndev))
> +		goto refill;
> +
> +	skb_put(skb, MSCC_FDMA_DCB_STAT_BLOCKL(stat) - ETH_FCS_LEN);
> +	skb_pull(skb, OCELOT_TAG_LEN);
> +
> +	skb->dev = ndev;
> +	skb->protocol = eth_type_trans(skb, skb->dev);
> +	skb->dev->stats.rx_bytes += skb->len;
> +	skb->dev->stats.rx_packets++;
> +
> +	ocelot_ptp_rx_timestamp(ocelot, skb, timestamp);

You forgot to extract the "timestamp" from the XFH, and are providing
junk from the kernel stack memory. Please make sure to test PTP.

> +
> +	if (!skb_defer_rx_timestamp(skb))
> +		netif_receive_skb(skb);
> +
> +	skb = napi_alloc_skb(&fdma->napi, fdma->rx_buf_size);
> +	if (!skb)
> +		return false;

See my comment below, on the ocelot_fdma_rx_skb_alloc() function, on why
I think you are making sub-optimal use of the ring concept.

> +
> +refill:
> +	if (!ocelot_fdma_rx_set_skb(fdma, dcb, skb, fdma->rx_buf_size))
> +		return false;
> +
> +	/* Chain the next DCB */
> +	next_dcb = &ring->dcbs[ring->head];
> +	dcb->hw->llp = next_dcb->hw_dma;
> +
> +	return true;
> +}
> +
> +static void ocelot_fdma_tx_cleanup(struct ocelot_fdma *fdma, int budget)
> +{
> +	struct ocelot_fdma_ring *ring = &fdma->inj;
> +	unsigned int tmp_head, new_null_llp_idx;
> +	struct ocelot_fdma_dcb *dcb;
> +	bool end_of_list = false;
> +	int ret;
> +
> +	spin_lock_bh(&fdma->xmit_lock);
> +
> +	/* Purge the TX packets that have been sent up to the NULL llp or the
> +	 * end of done list.
> +	 */
> +	while (!ocelot_fdma_ring_empty(&fdma->inj)) {

s/&fdma->inj/ring/

> +		dcb = &ring->dcbs[ring->head];
> +		if (!(dcb->hw->stat & MSCC_FDMA_DCB_STAT_PD))
> +			break;
> +
> +		tmp_head = ring->head;

Unused.

> +		ring->head = ocelot_fdma_idx_incr(ring->head);
> +
> +		dma_unmap_single(fdma->dev, dcb->mapping, dcb->mapped_size,
> +				 DMA_TO_DEVICE);
> +		napi_consume_skb(dcb->skb, budget);
> +
> +		/* If we hit the NULL LLP, stop, we might need to reload FDMA */
> +		if (dcb->hw->llp == 0) {
> +			end_of_list = true;
> +			break;
> +		}
> +	}
> +
> +	/* If there is still some DCBs to be processed by the FDMA or if the
> +	 * pending list is empty, there is no need to restart the FDMA.
> +	 */

I don't understand why you restart the injection channel from the TX
confirmation interrupt. It raised the interrupt to tell you that it hit
a NULL LLP because there's nothing left to send. If you restart it now and
no other transmission has happened in the meantime, won't it stop again?

> +	if (!end_of_list || ocelot_fdma_ring_empty(&fdma->inj))

s/&fdma->inj/ring/

> +		goto out_unlock;
> +
> +	ret = ocelot_fdma_wait_chan_safe(fdma, MSCC_FDMA_INJ_CHAN);
> +	if (ret) {
> +		dev_warn(fdma->dev, "Failed to wait for TX channel to stop\n");
> +		goto out_unlock;
> +	}
> +
> +	/* Set NULL LLP */
> +	new_null_llp_idx = ocelot_fdma_idx_decr(ring->tail);
> +	dcb = &ring->dcbs[new_null_llp_idx];
> +	dcb->hw->llp = 0;
> +
> +	dcb = &ring->dcbs[ring->head];
> +	ocelot_fdma_activate_chan(fdma, dcb, MSCC_FDMA_INJ_CHAN);

	if (unlikely(netif_queue_stopped(dev)) &&
	    ocelot_fdma_tx_free_count(fdma))
		netif_wake_queue(dev);

This can then be tweaked for when you add support for scatter/gather xmit.

> +
> +out_unlock:
> +	spin_unlock_bh(&fdma->xmit_lock);
> +}
> +
> +static int ocelot_fdma_napi_poll(struct napi_struct *napi, int budget)
> +{
> +	struct ocelot_fdma *fdma = container_of(napi, struct ocelot_fdma, napi);
> +	int work_done = 0;
> +
> +	ocelot_fdma_tx_cleanup(fdma, budget);
> +
> +	while (work_done < budget) {
> +		if (!ocelot_fdma_rx_get(fdma, budget))
> +			break;
> +
> +		work_done++;
> +	}
> +
> +	ocelot_fdma_rx_restart(fdma);
> +
> +	if (work_done < budget) {
> +		napi_complete_done(&fdma->napi, work_done);
> +		ocelot_fdma_writel(fdma, MSCC_FDMA_INTR_ENA,
> +				   BIT(MSCC_FDMA_INJ_CHAN) |
> +				   BIT(MSCC_FDMA_XTR_CHAN));
> +	}
> +
> +	return work_done;
> +}
> +
> +static irqreturn_t ocelot_fdma_interrupt(int irq, void *dev_id)
> +{
> +	u32 ident, llp, frm, err, err_code;
> +	struct ocelot_fdma *fdma = dev_id;
> +
> +	ident = ocelot_fdma_readl(fdma, MSCC_FDMA_INTR_IDENT);
> +	frm = ocelot_fdma_readl(fdma, MSCC_FDMA_INTR_FRM);
> +	llp = ocelot_fdma_readl(fdma, MSCC_FDMA_INTR_LLP);
> +
> +	ocelot_fdma_writel(fdma, MSCC_FDMA_INTR_LLP, llp & ident);
> +	ocelot_fdma_writel(fdma, MSCC_FDMA_INTR_FRM, frm & ident);
> +	if (frm || llp) {
> +		ocelot_fdma_writel(fdma, MSCC_FDMA_INTR_ENA, 0);
> +		napi_schedule(&fdma->napi);
> +	}
> +
> +	err = ocelot_fdma_readl(fdma, MSCC_FDMA_EVT_ERR);
> +	if (unlikely(err)) {
> +		err_code = ocelot_fdma_readl(fdma, MSCC_FDMA_EVT_ERR_CODE);
> +		dev_err_ratelimited(fdma->dev,
> +				    "Error ! chans mask: %#x, code: %#x\n",
> +				    err, err_code);
> +
> +		ocelot_fdma_writel(fdma, MSCC_FDMA_EVT_ERR, err);
> +		ocelot_fdma_writel(fdma, MSCC_FDMA_EVT_ERR_CODE, err_code);
> +	}
> +
> +	return IRQ_HANDLED;
> +}
> +
> +static void ocelot_fdma_send_skb(struct ocelot_fdma *fdma, struct sk_buff *skb)
> +{
> +	struct ocelot_fdma_ring *ring = &fdma->inj;
> +	struct ocelot_fdma_dcb *dcb, *next;
> +
> +	dcb = &ring->dcbs[ring->tail];
> +	if (!ocelot_fdma_tx_dcb_set_skb(fdma, dcb, skb)) {
> +		dev_kfree_skb_any(skb);
> +		return;
> +	}
> +
> +	if (ocelot_fdma_ring_empty(&fdma->inj)) {

s/&fdma->inj/ring/

> +		ocelot_fdma_activate_chan(fdma, dcb, MSCC_FDMA_INJ_CHAN);
> +	} else {
> +		next = &ring->dcbs[ocelot_fdma_idx_incr(ring->tail)];
> +		dcb->hw->llp = next->hw_dma;
> +	}
> +
> +	ring->tail = ocelot_fdma_idx_incr(ring->tail);

You still have locking between TX and TX conf, that's too bad. Why is
that, I wonder? Because TX conf (ocelot_fdma_tx_cleanup) updates
ring->head and TX (ocelot_fdma_send_skb) updates ring->tail. Could it be
because you're updating the ring->tail _after_ you've activated the
injection channel, therefore exposing you to a race with the completion
interrupt which reads ring->tail?

> +
> +	skb_tx_timestamp(skb);
> +}
> +
> +static int ocelot_fdma_prepare_skb(struct ocelot_fdma *fdma, int port,
> +				   u32 rew_op, struct sk_buff *skb,
> +				   struct net_device *dev)
> +{
> +	int needed_headroom = max_t(int, OCELOT_TAG_LEN - skb_headroom(skb), 0);
> +	int needed_tailroom = max_t(int, ETH_FCS_LEN - skb_tailroom(skb), 0);
> +	struct ocelot_port *ocelot_port = fdma->ocelot->ports[port];
> +	void *ifh;
> +	int err;
> +
> +	if (unlikely(needed_headroom || needed_tailroom ||
> +		     skb_header_cloned(skb))) {
> +		err = pskb_expand_head(skb, needed_headroom, needed_tailroom,
> +				       GFP_ATOMIC);
> +		if (unlikely(err)) {
> +			dev_kfree_skb_any(skb);
> +			return 1;
> +		}
> +	}
> +
> +	err = skb_linearize(skb);
> +	if (err) {
> +		net_err_ratelimited("%s: skb_linearize error (%d)!\n",
> +				    dev->name, err);
> +		dev_kfree_skb_any(skb);
> +		return 1;
> +	}
> +
> +	ifh = skb_push(skb, OCELOT_TAG_LEN);
> +	skb_put(skb, ETH_FCS_LEN);
> +	ocelot_ifh_port_set(ifh, ocelot_port, rew_op, skb_vlan_tag_get(skb));
> +
> +	return 0;
> +}
> +
> +int ocelot_fdma_inject_frame(struct ocelot_fdma *fdma, int port, u32 rew_op,
> +			     struct sk_buff *skb, struct net_device *dev)
> +{
> +	int ret = NETDEV_TX_OK;
> +
> +	spin_lock(&fdma->xmit_lock);
> +
> +	if (ocelot_fdma_tx_free_count(fdma) == 0) {
> +		ret = NETDEV_TX_BUSY;

netif_stop_queue(dev);

> +		goto out;
> +	}
> +
> +	if (ocelot_fdma_prepare_skb(fdma, port, rew_op, skb, dev))
> +		goto out;
> +
> +	ocelot_fdma_send_skb(fdma, skb);
> +
> +out:
> +	spin_unlock(&fdma->xmit_lock);
> +
> +	return ret;
> +}
> +
> +static void ocelot_fdma_ring_free(struct ocelot_fdma *fdma,
> +				  struct ocelot_fdma_ring *ring)
> +{
> +	dmam_free_coherent(fdma->dev, OCELOT_DCBS_HW_ALLOC_SIZE, ring->hw_dcbs,
> +			   ring->hw_dcbs_dma);
> +}
> +
> +static int ocelot_fdma_ring_alloc(struct ocelot_fdma *fdma,
> +				  struct ocelot_fdma_ring *ring)
> +{
> +	struct ocelot_fdma_dcb_hw_v2 *hw_dcbs;
> +	struct ocelot_fdma_dcb *dcb;
> +	dma_addr_t hw_dcbs_dma;
> +	unsigned int adjust;
> +	int i;
> +
> +	/* Create a pool of consistent memory blocks for hardware descriptors */
> +	ring->hw_dcbs = dmam_alloc_coherent(fdma->dev,
> +					    OCELOT_DCBS_HW_ALLOC_SIZE,
> +					    &ring->hw_dcbs_dma, GFP_KERNEL);
> +	if (!ring->hw_dcbs)
> +		return -ENOMEM;
> +
> +	/* DCBs must be aligned on a 32bit boundary */
> +	hw_dcbs = ring->hw_dcbs;
> +	hw_dcbs_dma = ring->hw_dcbs_dma;
> +	if (!IS_ALIGNED(hw_dcbs_dma, 4)) {
> +		adjust = hw_dcbs_dma & 0x3;
> +		hw_dcbs_dma = ALIGN(hw_dcbs_dma, 4);
> +		hw_dcbs = (void *)hw_dcbs + adjust;
> +	}
> +
> +	for (i = 0; i < OCELOT_FDMA_MAX_DCB; i++) {
> +		dcb = &ring->dcbs[i];
> +		dcb->hw = &hw_dcbs[i];
> +		dcb->hw_dma = hw_dcbs_dma +
> +			     i * sizeof(struct ocelot_fdma_dcb_hw_v2);
> +	}
> +
> +	return 0;
> +}
> +
> +static int ocelot_fdma_rx_skb_alloc(struct ocelot_fdma *fdma)
> +{
> +	struct ocelot_fdma_dcb *dcb, *prev_dcb = NULL;
> +	struct ocelot_fdma_ring *ring = &fdma->xtr;
> +	struct sk_buff *skb;
> +	int i;
> +
> +	for (i = 0; i < OCELOT_FDMA_MAX_DCB; i++) {
> +		dcb = &ring->dcbs[i];
> +		skb = napi_alloc_skb(&fdma->napi, fdma->rx_buf_size);

I have to tell you, skb allocation at probe time is something I haven't
seen before, I'll have to defer to somebody else for a second opinion.
I understand that you keep the sk_buff structure closely tied to the DCB
structure, whereas normally you'd see only DCB structures in the RX ring.

And napi_alloc_skb? This isn't NAPI context.

The whole idea (as far as I understand it!) with a receive ring is that
hardware produces data on one side of it, and software consumes from
that side too and in the same direction, just lags behind a little bit.
Software must also provide buffers at the opposite end, in which
hardware will put the produced data, so that the pipeline never runs
out. You'll notice that it is standard amongst ring based drivers to
name your ring->head as ring->next_to_clean, and your ring->tail as
ring->next_to_use (actually not quite: your hardware doesn't really
provide producer and consumer indices, and the way in which you update
the ring->tail past the NULL LLP pointer is a bit different from the way
in which drivers use ring->next_to_use and more like ring->next_to_alloc,
but more on that later). I think people reviewing your code will instantly
know what it's about if you name and structure things in this way,
perhaps the lack of consumer and producer indices is merely an
irrelevant detail.

As long as the ring size is large enough in order to give software some
time to react, you can reach a state of equilibrium where you don't need
to allocate or DMA map any new buffer, you can just recycle the ones you
already have. To use buffer recycling, you need to replace alloc_skb()
with functions such as build_skb(), which builds an sk_buff structure
around a pre-existing buffer.

One simple way to achieve this is the page flipping technique. There may
be others and you don't have to use this one. The principle is as
follows: you allocate and DMA map buffers double the size you need
(PAGE_SIZE is a common choice, divided by two you get about 2K of data
per buffer, for frames larger than that you should do scatter/gather)
and you populate an RX DCB with just one half of that page, the other
half is unused for now. You repeat this process until you've filled up
the RX ring with sufficient DCB entries, and all of this happens at
initialization time.

Then, when you process a frame from the RX DCB ring (in the NAPI poll
function), you construct a skb around that page half, and before giving
it to the network stack, you speculatively attempt to reuse the other
half of the page by putting it back into the RX DCB ring, at the
opposite end compared to where you're consuming the other half from.
Ring-based drivers use a separate ring->next_to_alloc variable for this.
DCB elements ranging from ring->next_to_use and up to ring->next_to_alloc
are simply halves of pages that have been processed by your NAPI, and
are ready to be committed to hardware when you need to refill (in your
case update the LLP pointer).

So each page could have up to two users (references), and you need to be
very careful how you deal with concurrency with the stack. As Claudiu
explained in great detail to me when I was first studying this
mechanism, the driver is the producer of these RX pages and the network
stack is the consumer. And since the network stack will eventually call
kfree_skb() after it's done, which ultimately results into a put_page(),
you need to counteract that action and ensure that if you want to recycle
the other half of the page, the page's refcount never reaches zero.

As a result, if you deem the other page half as good for recycling, you
need to bump the reference count of the page from 1 to 2. This will make
it safe for you to refill the DCB ring with that other half.

Of course, the other half of the page might not be available for
recycling, this is why the technique is speculative. If the stack hasn't
yet called kfree_skb() on the other half, the page reference count will
be 2 at the time you're cleaning the buffer, and bad luck, you can no
longer reuse this page, so the driver needs to take its hands off of it.

If you look at driver implementations of the half page flip heuristic,
you'll see that "taking your hands off the page" means simply to DMA
unmap the entire page. This might surprise you, after all, the reference
count of the page is 2, you might think that the page will leak if you
don't decrement one of the references on it, or something. But if you
think about it, the reference count is 2 because one half has an skb
built at an earlier time around it, which is still in the network stack
there somewhere pending a kfree_skb(), and the other half is the buffer
you're cleaning right now. This half is practically promised to the
network stack, you'll create an skb around this half too, and the stack
will kfree it too, and the refcount of the page will thus drop to zero.

Of course, when you need to refill the RX ring with, say, 32 buffers,
you might find that (ring->next_to_alloc - ring->next_to_use) mod ring size
is less than 32 (otherwise said, the buffer reuse technique couldn't
provide enough buffers). So you need to alloc and DMA map new pages from
ring->next_to_alloc up to 32 (but this is still in contrast with your
current implementation which always calls napi_alloc_skb 32 times).
Nonetheless, in my experience, the technique works very well in real
life situations and is a very good way to reduce the pressure on the
memory allocator and avoid costly DMA mapping and unmapping per packet.
Since you've already went through the trouble of making this a
ring-based driver, I believe you should try to implement a buffer reuse
technique too, especially for this particular case of a very slow
processor.

> +		if (!skb)
> +			goto skb_alloc_failed;
> +
> +		ocelot_fdma_rx_set_skb(fdma, dcb, skb, fdma->rx_buf_size);
> +
> +		if (prev_dcb)
> +			prev_dcb->hw->llp = dcb->hw_dma;
> +
> +		prev_dcb = dcb;
> +	}
> +
> +	ring->head = 0;
> +	ring->tail = OCELOT_FDMA_MAX_DCB - 1;
> +
> +	return 0;
> +
> +skb_alloc_failed:
> +	for (i = 0; i < OCELOT_FDMA_MAX_DCB; i++) {
> +		dcb = &ring->dcbs[i];
> +		if (!dcb->skb)
> +			break;
> +
> +		dev_kfree_skb_any(dcb->skb);
> +	}
> +
> +	return -ENOMEM;
> +}
> +
> +static int ocelot_fdma_rx_init(struct ocelot_fdma *fdma)
> +{
> +	int ret;
> +
> +	fdma->rx_buf_size = ocelot_fdma_rx_buf_size(OCELOT_FDMA_RX_MTU);
> +
> +	ret = ocelot_fdma_rx_skb_alloc(fdma);
> +	if (ret) {
> +		netif_napi_del(&fdma->napi);
> +		return ret;
> +	}
> +
> +	napi_enable(&fdma->napi);
> +
> +	ocelot_fdma_activate_chan(fdma, &fdma->xtr.dcbs[0],
> +				  MSCC_FDMA_XTR_CHAN);
> +
> +	return 0;
> +}
> +
> +void ocelot_fdma_netdev_init(struct ocelot_fdma *fdma, struct net_device *dev)
> +{
> +	dev->needed_headroom = OCELOT_TAG_LEN;
> +	dev->needed_tailroom = ETH_FCS_LEN;

The needed_headroom is in no way specific to FDMA, right? Why aren't you
doing it for manual register-based injection too? (in a separate patch ofc)

> +
> +	if (fdma->ndev)
> +		return;
> +
> +	fdma->ndev = dev;
> +	netif_napi_add(dev, &fdma->napi, ocelot_fdma_napi_poll,
> +		       OCELOT_FDMA_WEIGHT);

I understand that NAPI is per netdev but you have a single interrupt so
you need to share the NAPI instance for all ports. That is fine.
But danger ahead, see this:

mscc_ocelot_init_ports() does:

		err = ocelot_probe_port(ocelot, port, target, portnp);
		if (err) {
			ocelot_port_devlink_teardown(ocelot, port);
			continue;
		}

aka it skips over ports that failed to probe.
And ocelot_probe_port does:

	if (ocelot->fdma)
		ocelot_fdma_netdev_init(ocelot->fdma, dev);

	err = register_netdev(dev);
	if (err) {
		dev_err(ocelot->dev, "register_netdev failed\n");
		goto out;
	}

So if register_netdev() fails, you will have a dangling, freed pointer
inside fdma->ndev. That is not good, as far as I can tell. Try to make
the probing of your first port fail at register_netdev() time, to see
what I mean.

> +}
> +
> +void ocelot_fdma_netdev_deinit(struct ocelot_fdma *fdma, struct net_device *dev)
> +{
> +	if (dev == fdma->ndev)
> +		netif_napi_del(&fdma->napi);
> +}
> +
> +struct ocelot_fdma *ocelot_fdma_init(struct platform_device *pdev,
> +				     struct ocelot *ocelot)
> +{
> +	struct ocelot_fdma *fdma;
> +	void __iomem *base;
> +	int ret;
> +
> +	base = devm_platform_ioremap_resource_byname(pdev, "fdma");
> +	if (IS_ERR_OR_NULL(base))
> +		return NULL;
> +
> +	fdma = devm_kzalloc(&pdev->dev, sizeof(*fdma), GFP_KERNEL);
> +	if (!fdma)
> +		goto err_release_resource;
> +
> +	fdma->ocelot = ocelot;
> +	fdma->base = base;
> +	fdma->dev = &pdev->dev;
> +	fdma->dev->coherent_dma_mask = DMA_BIT_MASK(32);
> +
> +	ocelot_fdma_writel(fdma, MSCC_FDMA_INTR_ENA, 0);
> +
> +	fdma->irq = platform_get_irq_byname(pdev, "fdma");
> +	ret = devm_request_irq(&pdev->dev, fdma->irq, ocelot_fdma_interrupt, 0,
> +			       dev_name(&pdev->dev), fdma);
> +	if (ret)
> +		goto err_free_fdma;
> +
> +	ret = ocelot_fdma_ring_alloc(fdma, &fdma->inj);
> +	if (ret)
> +		goto err_free_irq;
> +
> +	ret = ocelot_fdma_ring_alloc(fdma, &fdma->xtr);
> +	if (ret)
> +		goto free_inj_ring;
> +
> +	return fdma;
> +
> +free_inj_ring:
> +	ocelot_fdma_ring_free(fdma, &fdma->inj);
> +err_free_irq:
> +	devm_free_irq(&pdev->dev, fdma->irq, fdma);
> +err_free_fdma:
> +	devm_kfree(&pdev->dev, fdma);
> +err_release_resource:
> +	devm_iounmap(&pdev->dev, base);
> +
> +	return NULL;
> +}
> +
> +int ocelot_fdma_start(struct ocelot_fdma *fdma)
> +{
> +	struct ocelot *ocelot = fdma->ocelot;
> +	int ret;
> +
> +	ret = ocelot_fdma_rx_init(fdma);
> +	if (ret)
> +		return -EINVAL;
> +
> +	/* Reconfigure for extraction and injection using DMA */
> +	ocelot_write_rix(ocelot, QS_INJ_GRP_CFG_MODE(2), QS_INJ_GRP_CFG, 0);
> +	ocelot_write_rix(ocelot, QS_INJ_CTRL_GAP_SIZE(0), QS_INJ_CTRL, 0);
> +
> +	ocelot_write_rix(ocelot, QS_XTR_GRP_CFG_MODE(2), QS_XTR_GRP_CFG, 0);
> +
> +	ocelot_fdma_writel(fdma, MSCC_FDMA_INTR_LLP, 0xffffffff);
> +	ocelot_fdma_writel(fdma, MSCC_FDMA_INTR_FRM, 0xffffffff);
> +
> +	ocelot_fdma_writel(fdma, MSCC_FDMA_INTR_LLP_ENA,
> +			   BIT(MSCC_FDMA_INJ_CHAN) | BIT(MSCC_FDMA_XTR_CHAN));
> +	ocelot_fdma_writel(fdma, MSCC_FDMA_INTR_FRM_ENA, BIT(MSCC_FDMA_XTR_CHAN));
> +	ocelot_fdma_writel(fdma, MSCC_FDMA_INTR_ENA,
> +			   BIT(MSCC_FDMA_INJ_CHAN) | BIT(MSCC_FDMA_XTR_CHAN));
> +
> +	return 0;
> +}
> +
> +int ocelot_fdma_stop(struct ocelot_fdma *fdma)

This should return void.

> +{
> +	struct ocelot_fdma_ring *ring = &fdma->xtr;
> +	struct ocelot_fdma_dcb *dcb;
> +	int i;
> +
> +	ocelot_fdma_writel(fdma, MSCC_FDMA_INTR_ENA, 0);
> +
> +	ocelot_fdma_stop_channel(fdma, MSCC_FDMA_XTR_CHAN);
> +	ocelot_fdma_stop_channel(fdma, MSCC_FDMA_INJ_CHAN);
> +
> +	/* Free the SKB hold in the extraction ring */
> +	for (i = 0; i < OCELOT_FDMA_MAX_DCB; i++) {
> +		dcb = &ring->dcbs[i];
> +		dev_kfree_skb_any(dcb->skb);
> +	}
> +
> +	napi_synchronize(&fdma->napi);
> +	napi_disable(&fdma->napi);
> +
> +	return 0;
> +}
> diff --git a/drivers/net/ethernet/mscc/ocelot_fdma.h b/drivers/net/ethernet/mscc/ocelot_fdma.h
> new file mode 100644
> index 000000000000..b6f1dda0e0c7
> --- /dev/null
> +++ b/drivers/net/ethernet/mscc/ocelot_fdma.h
> @@ -0,0 +1,96 @@
> +/* SPDX-License-Identifier: (GPL-2.0 OR MIT) */
> +/*
> + * Microsemi SoCs FDMA driver
> + *
> + * Copyright (c) 2021 Microchip
> + */
> +#ifndef _MSCC_OCELOT_FDMA_H_
> +#define _MSCC_OCELOT_FDMA_H_
> +
> +#include "ocelot.h"
> +
> +#define OCELOT_FDMA_MAX_DCB		128
> +/* +4 allows for word alignment after allocation */
> +#define OCELOT_DCBS_HW_ALLOC_SIZE	(OCELOT_FDMA_MAX_DCB * \
> +					 sizeof(struct ocelot_fdma_dcb_hw_v2) + \
> +					 4)
> +
> +struct ocelot_fdma_dcb_hw_v2 {
> +	u32 llp;
> +	u32 datap;
> +	u32 datal;
> +	u32 stat;
> +};

Could you declare this using __attribute((packed)) to show that you're
mapping it over hardware?

> +
> +/**
> + * struct ocelot_fdma_dcb - Software DCBs description
> + *
> + * @hw: hardware DCB used by hardware(coherent memory)
> + * @hw_dma: DMA address of the DCB
> + * @skb: skb associated with the DCB
> + * @mapping: Address of the skb data mapping
> + * @mapped_size: Mapped size
> + */
> +struct ocelot_fdma_dcb {
> +	struct ocelot_fdma_dcb_hw_v2	*hw;
> +	dma_addr_t			hw_dma;
> +	struct sk_buff			*skb;
> +	dma_addr_t			mapping;
> +	size_t				mapped_size;
> +};
> +
> +/**
> + * struct ocelot_fdma_ring - "Ring" description of DCBs
> + *
> + * @hw_dcbs: Hardware DCBs allocated for the ring
> + * @hw_dcbs_dma: DMA address of the DCBs
> + * @dcbs: List of software DCBs
> + * @head: pointer to first available DCB
> + * @tail: pointer to last available DCB
> + */
> +struct ocelot_fdma_ring {
> +	struct ocelot_fdma_dcb_hw_v2	*hw_dcbs;
> +	dma_addr_t			hw_dcbs_dma;
> +	struct ocelot_fdma_dcb		dcbs[OCELOT_FDMA_MAX_DCB];
> +	unsigned int			head;
> +	unsigned int			tail;
> +};
> +
> +/**
> + * struct ocelot_fdma - FMDA struct

s/FMDA/FDMA/

> + *
> + * @ocelot: Pointer to ocelot struct
> + * @base: base address of FDMA registers
> + * @irq: FDMA interrupt
> + * @dev: Ocelot device
> + * @napi: napi handle
> + * @rx_buf_size: Size of RX buffer
> + * @inj: Injection ring
> + * @xtr: Extraction ring
> + * @xmit_lock: Xmit lock
> + *
> + */
> +struct ocelot_fdma {
> +	struct ocelot			*ocelot;

To me, this structure organization in which "struct ocelot_fdma *" is
passed as argument to all FDMA functions, instead of "struct ocelot *",
is strange, and leads to oddities such as this backpointer right here.
Do it in whichever way you want, I'm just pointing this out.

> +	void __iomem			*base;
> +	int				irq;
> +	struct device			*dev;
> +	struct napi_struct		napi;
> +	struct net_device		*ndev;
> +	size_t				rx_buf_size;
> +	struct ocelot_fdma_ring		inj;
> +	struct ocelot_fdma_ring		xtr;
> +	spinlock_t			xmit_lock;
> +};
> +
> +struct ocelot_fdma *ocelot_fdma_init(struct platform_device *pdev,
> +				     struct ocelot *ocelot);
> +int ocelot_fdma_start(struct ocelot_fdma *fdma);
> +int ocelot_fdma_stop(struct ocelot_fdma *fdma);
> +int ocelot_fdma_inject_frame(struct ocelot_fdma *fdma, int port, u32 rew_op,
> +			     struct sk_buff *skb, struct net_device *dev);
> +void ocelot_fdma_netdev_init(struct ocelot_fdma *fdma, struct net_device *dev);
> +void ocelot_fdma_netdev_deinit(struct ocelot_fdma *fdma,
> +			       struct net_device *dev);
> +
> +#endif
> diff --git a/drivers/net/ethernet/mscc/ocelot_net.c b/drivers/net/ethernet/mscc/ocelot_net.c
> index b589ae95e29b..9dcaf421da12 100644
> --- a/drivers/net/ethernet/mscc/ocelot_net.c
> +++ b/drivers/net/ethernet/mscc/ocelot_net.c
> @@ -15,6 +15,7 @@
>  #include <net/pkt_cls.h>
>  #include "ocelot.h"
>  #include "ocelot_vcap.h"
> +#include "ocelot_fdma.h"
>  
>  #define OCELOT_MAC_QUIRKS	OCELOT_QUIRK_QSGMII_PORTS_MUST_BE_UP
>  
> @@ -457,7 +458,7 @@ static netdev_tx_t ocelot_port_xmit(struct sk_buff *skb, struct net_device *dev)
>  	int port = priv->chip_port;
>  	u32 rew_op = 0;
>  
> -	if (!ocelot_can_inject(ocelot, 0))
> +	if (!ocelot->fdma && !ocelot_can_inject(ocelot, 0))
>  		return NETDEV_TX_BUSY;
>  
>  	/* Check if timestamping is needed */
> @@ -475,9 +476,13 @@ static netdev_tx_t ocelot_port_xmit(struct sk_buff *skb, struct net_device *dev)
>  		rew_op = ocelot_ptp_rew_op(skb);
>  	}
>  
> -	ocelot_port_inject_frame(ocelot, port, 0, rew_op, skb);
> +	if (ocelot->fdma) {
> +		ocelot_fdma_inject_frame(ocelot->fdma, port, rew_op, skb, dev);
> +	} else {
> +		ocelot_port_inject_frame(ocelot, port, 0, rew_op, skb);

I can't help but think how painful it is that with a CPU as slow as
yours, insult over injury, you also need to check for each packet
whether the device tree had defined the "fdma" region or not, because
you practically keep two traffic I/O implementations due to that sole
reason. I think for the ocelot switchdev driver, which is strictly for
MIPS CPUs embedded within the device, it should be fine to introduce a
static key here (search for static_branch_likely in the kernel).

>  
> -	kfree_skb(skb);
> +		consume_skb(skb);
> +	}
>  
>  	return NETDEV_TX_OK;
>  }
> @@ -1717,6 +1722,9 @@ int ocelot_probe_port(struct ocelot *ocelot, int port, struct regmap *target,
>  	if (err)
>  		goto out;
>  
> +	if (ocelot->fdma)
> +		ocelot_fdma_netdev_init(ocelot->fdma, dev);
> +
>  	err = register_netdev(dev);
>  	if (err) {
>  		dev_err(ocelot->dev, "register_netdev failed\n");
> @@ -1737,9 +1745,13 @@ void ocelot_release_port(struct ocelot_port *ocelot_port)
>  	struct ocelot_port_private *priv = container_of(ocelot_port,
>  						struct ocelot_port_private,
>  						port);
> +	struct ocelot_fdma *fdma = ocelot_port->ocelot->fdma;
>  
>  	unregister_netdev(priv->dev);
>  
> +	if (fdma)
> +		ocelot_fdma_netdev_deinit(fdma, priv->dev);
> +
>  	if (priv->phylink) {
>  		rtnl_lock();
>  		phylink_disconnect_phy(priv->phylink);
> diff --git a/drivers/net/ethernet/mscc/ocelot_vsc7514.c b/drivers/net/ethernet/mscc/ocelot_vsc7514.c
> index 38103b0255b0..fa68eb23a333 100644
> --- a/drivers/net/ethernet/mscc/ocelot_vsc7514.c
> +++ b/drivers/net/ethernet/mscc/ocelot_vsc7514.c
> @@ -18,6 +18,7 @@
>  
>  #include <soc/mscc/ocelot_vcap.h>
>  #include <soc/mscc/ocelot_hsio.h>
> +#include "ocelot_fdma.h"
>  #include "ocelot.h"
>  
>  static const u32 ocelot_ana_regmap[] = {
> @@ -1080,6 +1081,10 @@ static int mscc_ocelot_probe(struct platform_device *pdev)
>  		ocelot->targets[io_target[i].id] = target;
>  	}
>  
> +	ocelot->fdma = ocelot_fdma_init(pdev, ocelot);
> +	if (IS_ERR(ocelot->fdma))
> +		ocelot->fdma = NULL;
> +
>  	hsio = syscon_regmap_lookup_by_compatible("mscc,ocelot-hsio");
>  	if (IS_ERR(hsio)) {
>  		dev_err(&pdev->dev, "missing hsio syscon\n");
> @@ -1139,6 +1144,12 @@ static int mscc_ocelot_probe(struct platform_device *pdev)
>  	if (err)
>  		goto out_ocelot_devlink_unregister;
>  
> +	if (ocelot->fdma) {
> +		err = ocelot_fdma_start(ocelot->fdma);
> +		if (err)
> +			goto out_ocelot_release_ports;
> +	}
> +
>  	err = ocelot_devlink_sb_register(ocelot);
>  	if (err)
>  		goto out_ocelot_release_ports;
> @@ -1179,6 +1190,8 @@ static int mscc_ocelot_remove(struct platform_device *pdev)
>  {
>  	struct ocelot *ocelot = platform_get_drvdata(pdev);
>  
> +	if (ocelot->fdma)
> +		ocelot_fdma_stop(ocelot->fdma);
>  	devlink_unregister(ocelot->devlink);
>  	ocelot_deinit_timestamp(ocelot);
>  	ocelot_devlink_sb_unregister(ocelot);
> diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h
> index b3381c90ff3e..351ab385ab98 100644
> --- a/include/soc/mscc/ocelot.h
> +++ b/include/soc/mscc/ocelot.h
> @@ -695,6 +695,8 @@ struct ocelot {
>  	/* Protects the PTP clock */
>  	spinlock_t			ptp_clock_lock;
>  	struct ptp_pin_desc		ptp_pins[OCELOT_PTP_PINS_NUM];
> +
> +	struct ocelot_fdma		*fdma;
>  };
>  
>  struct ocelot_policer {
> @@ -761,6 +763,8 @@ void ocelot_ifh_port_set(void *ifh, struct ocelot_port *port, u32 rew_op,
>  			 u32 vlan_tag);
>  int ocelot_xtr_poll_frame(struct ocelot *ocelot, int grp, struct sk_buff **skb);
>  void ocelot_drain_cpu_queue(struct ocelot *ocelot, int grp);
> +void ocelot_ptp_rx_timestamp(struct ocelot *ocelot, struct sk_buff *skb,
> +			     u64 timestamp);
>  
>  /* Hardware initialization */
>  int ocelot_regfields_init(struct ocelot *ocelot,
> -- 
> 2.33.1
>