Re: [PATCH 2/3] can: m_can: add bus error handling

Dong Aisheng <b29396@xxxxxxxxxxxxx> · Wed, 2 Jul 2014 14:31:51 +0800

On Tue, Jul 01, 2014 at 12:37:07PM +0200, Marc Kleine-Budde wrote:
> On 06/27/2014 12:00 PM, Dong Aisheng wrote:
> > Add bus error, state change, lost message handling mechanism.
> > 
> > Signed-off-by: Dong Aisheng <b29396@xxxxxxxxxxxxx>
> > ---
> >  drivers/net/can/m_can.c |  271 +++++++++++++++++++++++++++++++++++++++++------
> >  1 files changed, 240 insertions(+), 31 deletions(-)
> > 
> > diff --git a/drivers/net/can/m_can.c b/drivers/net/can/m_can.c
> > index 61e9a8e..e4aed71 100644
> > --- a/drivers/net/can/m_can.c
> > +++ b/drivers/net/can/m_can.c
> > @@ -83,6 +83,18 @@ enum m_can_reg {
> >  	M_CAN_TXEFA	= 0xf8,
> >  };
> >  
> > +/* m_can lec values */
> > +enum m_can_lec_type {
> > +	LEC_NO_ERROR = 0,
> > +	LEC_STUFF_ERROR,
> > +	LEC_FORM_ERROR,
> > +	LEC_ACK_ERROR,
> > +	LEC_BIT1_ERROR,
> > +	LEC_BIT0_ERROR,
> > +	LEC_CRC_ERROR,
> > +	LEC_UNUSED,
> > +};
> > +
> >  /* CC Control Register(CCCR) */
> >  #define CCCR_CCE	BIT(1)
> >  #define CCCR_INIT	BIT(0)
> > @@ -97,6 +109,19 @@ enum m_can_reg {
> >  #define BTR_SJW_SHIFT		0
> >  #define BTR_SJW_MASK		0xf
> >  
> > +/* Error Counter Register(ECR) */
> > +#define ECR_RP			BIT(15)
> > +#define ECR_REC_SHIFT		8
> > +#define ECR_REC_MASK		(0x7f << ECR_REC_SHIFT)
> > +#define ECR_TEC_SHIFT		0
> > +#define ECR_TEC_MASK		0xff
> > +
> > +/* Protocol Status Register(PSR) */
> > +#define PSR_BO		BIT(7)
> > +#define PSR_EW		BIT(6)
> > +#define PSR_EP		BIT(5)
> > +#define PSR_LEC_MASK	0x7
> > +
> >  /* Interrupt Register(IR) */
> >  #define IR_ALL_INT	0xffffffff
> >  #define IR_STE		BIT(31)
> > @@ -131,10 +156,11 @@ enum m_can_reg {
> >  #define IR_RF0F		BIT(2)
> >  #define IR_RF0W		BIT(1)
> >  #define IR_RF0N		BIT(0)
> > -#define IR_ERR_ALL	(IR_STE	| IR_FOE | IR_ACKE | IR_BE | IR_CRCE | \
> > -		IR_WDI | IR_BO | IR_EW | IR_EP | IR_ELO | IR_BEU | \
> > -		IR_BEC | IR_TOO | IR_MRAF | IR_TSW | IR_TEFL | IR_RF1L | \
> > -		IR_RF0L)
> > +#define IR_ERR_STATE	(IR_BO | IR_EW | IR_EP)
> > +#define IR_ERR_BUS	(IR_STE	| IR_FOE | IR_ACKE | IR_BE | IR_CRCE | \
> > +		IR_WDI | IR_ELO | IR_BEU | IR_BEC | IR_TOO | IR_MRAF | \
> > +		IR_TSW | IR_TEFL | IR_RF1L | IR_RF0L)
> > +#define IR_ERR_ALL	(IR_ERR_STATE | IR_ERR_BUS)
> >  
> >  /* Rx FIFO 0/1 Configuration (RXF0C/RXF1C) */
> >  #define RXFC_FWM_OFF	24
> > @@ -320,12 +346,175 @@ static int m_can_do_rx_poll(struct net_device *dev, int quota)
> >  	return num_rx_pkts;
> >  }
> >  
> > +static int m_can_handle_lost_msg(struct net_device *dev)
> > +{
> > +	struct net_device_stats *stats = &dev->stats;
> > +	struct sk_buff *skb;
> > +	struct can_frame *frame;
> > +
> > +	netdev_err(dev, "msg lost in rxf0\n");
> > +
> > +	skb = alloc_can_err_skb(dev, &frame);
> > +	if (unlikely(!skb))
> > +		return 0;
> 
> Please rearrange this function differently, so that the stats are
> updated, even if alloc_can_err_skb() fails.
> 

Good suggestion.
Will change them all with other places having the same issue.

> > +
> > +	frame->can_id |= CAN_ERR_CRTL;
> > +	frame->data[1] = CAN_ERR_CRTL_RX_OVERFLOW;
> > +	stats->rx_errors++;
> > +	stats->rx_over_errors++;
> > +
> > +	netif_receive_skb(skb);
> > +
> > +	return 1;
> > +}
> > +
> > +static int m_can_handle_bus_err(struct net_device *dev,
> > +				enum m_can_lec_type lec_type)
> > +{
> > +	struct m_can_priv *priv = netdev_priv(dev);
> > +	struct net_device_stats *stats = &dev->stats;
> > +	struct can_frame *cf;
> > +	struct sk_buff *skb;
> > +
> > +	/* early exit if no lec update */
> > +	if (lec_type == LEC_UNUSED)
> > +		return 0;
> > +
> > +	/* propagate the error condition to the CAN stack */
> > +	skb = alloc_can_err_skb(dev, &cf);
> > +	if (unlikely(!skb))
> > +		return 0;
> 
> same here
> 
> > +
> > +	/*
> > +	 * check for 'last error code' which tells us the
> > +	 * type of the last error to occur on the CAN bus
> > +	 */
> > +	priv->can.can_stats.bus_error++;
> > +	stats->rx_errors++;
> > +	cf->can_id |= CAN_ERR_PROT | CAN_ERR_BUSERROR;
> > +	cf->data[2] |= CAN_ERR_PROT_UNSPEC;
> > +
> > +	switch (lec_type) {
> > +	case LEC_STUFF_ERROR:
> > +		netdev_dbg(dev, "stuff error\n");
> > +		cf->data[2] |= CAN_ERR_PROT_STUFF;
> > +		break;
> > +	case LEC_FORM_ERROR:
> > +		netdev_dbg(dev, "form error\n");
> > +		cf->data[2] |= CAN_ERR_PROT_FORM;
> > +		break;
> > +	case LEC_ACK_ERROR:
> > +		netdev_dbg(dev, "ack error\n");
> > +		cf->data[3] |= (CAN_ERR_PROT_LOC_ACK |
> > +				CAN_ERR_PROT_LOC_ACK_DEL);
> > +		break;
> > +	case LEC_BIT1_ERROR:
> > +		netdev_dbg(dev, "bit1 error\n");
> > +		cf->data[2] |= CAN_ERR_PROT_BIT1;
> > +		break;
> > +	case LEC_BIT0_ERROR:
> > +		netdev_dbg(dev, "bit0 error\n");
> > +		cf->data[2] |= CAN_ERR_PROT_BIT0;
> > +		break;
> > +	case LEC_CRC_ERROR:
> > +		netdev_dbg(dev, "CRC error\n");
> > +		cf->data[3] |= (CAN_ERR_PROT_LOC_CRC_SEQ |
> > +				CAN_ERR_PROT_LOC_CRC_DEL);
> > +		break;
> > +	default:
> > +		break;
> > +	}
> > +
> > +	netif_receive_skb(skb);
> > +	stats->rx_packets++;
> > +	stats->rx_bytes += cf->can_dlc;
> > +
> > +	return 1;
> > +}
> > +
> > +static int m_can_get_berr_counter(const struct net_device *dev,
> > +				  struct can_berr_counter *bec)
> > +{
> > +	struct m_can_priv *priv = netdev_priv(dev);
> > +	unsigned int ecr;
> 
> This function might be called, even if the interface is down. You might
> have to enable the clock(s) here.
> 

Ok, got it, thanks for the info.

> > +	ecr = m_can_read(priv, M_CAN_ECR);
> > +	bec->rxerr = (ecr & ECR_REC_MASK) >> ECR_REC_SHIFT;
> > +	bec->txerr = ecr & ECR_TEC_MASK;
> > +
> > +	return 0;
> > +}
> > +
> > +static int m_can_handle_state_change(struct net_device *dev,
> > +				enum can_state new_state)
> > +{
> > +	struct m_can_priv *priv = netdev_priv(dev);
> > +	struct net_device_stats *stats = &dev->stats;
> > +	struct can_frame *cf;
> > +	struct sk_buff *skb;
> > +	struct can_berr_counter bec;
> > +	unsigned int ecr;
> > +
> > +	/* propagate the error condition to the CAN stack */
> > +	skb = alloc_can_err_skb(dev, &cf);
> > +	if (unlikely(!skb))
> > +		return 0;
> 
> Please rearrange the function, so that the stats and more important
> bus-off are handled correctly, even if this fails.
> 

Will do it.

> > +
> > +	m_can_get_berr_counter(dev, &bec);
> > +
> > +	switch (new_state) {
> > +	case CAN_STATE_ERROR_ACTIVE:
> > +		/* error warning state */
> > +		priv->can.can_stats.error_warning++;
> > +		priv->can.state = CAN_STATE_ERROR_WARNING;
> > +		cf->can_id |= CAN_ERR_CRTL;
> > +		cf->data[1] = (bec.txerr > bec.rxerr) ?
> > +			CAN_ERR_CRTL_TX_WARNING :
> > +			CAN_ERR_CRTL_RX_WARNING;
> > +		cf->data[6] = bec.txerr;
> > +		cf->data[7] = bec.rxerr;
> > +		break;
> > +	case CAN_STATE_ERROR_PASSIVE:
> > +		/* error passive state */
> > +		priv->can.can_stats.error_passive++;
> > +		priv->can.state = CAN_STATE_ERROR_PASSIVE;
> > +		cf->can_id |= CAN_ERR_CRTL;
> > +		ecr = m_can_read(priv, M_CAN_ECR);
> > +		if (ecr & ECR_RP)
> > +			cf->data[1] |= CAN_ERR_CRTL_RX_PASSIVE;
> > +		if (bec.txerr > 127)
> > +			cf->data[1] |= CAN_ERR_CRTL_TX_PASSIVE;
> > +		cf->data[6] = bec.txerr;
> > +		cf->data[7] = bec.rxerr;
> > +		break;
> > +	case CAN_STATE_BUS_OFF:
> > +		/* bus-off state */
> > +		priv->can.state = CAN_STATE_BUS_OFF;
> > +		cf->can_id |= CAN_ERR_BUSOFF;
> > +		/*
> > +		 * disable all interrupts in bus-off mode to ensure that
> > +		 * the CPU is not hogged down
> > +		 */
> > +		m_can_enable_all_interrupts(priv, false);
> > +		can_bus_off(dev);
> > +		break;
> > +	default:
> > +		break;
> > +	}
> > +
> > +	netif_receive_skb(skb);
> > +	stats->rx_packets++;
> > +	stats->rx_bytes += cf->can_dlc;
> 
> don't acces "cf" after netif_receive_sb();
> 

Got it.

> > +
> > +	return 1;
> > +}
> > +
> >  static int m_can_poll(struct napi_struct *napi, int quota)
> >  {
> >  	struct net_device *dev = napi->dev;
> >  	struct m_can_priv *priv = netdev_priv(dev);
> >  	u32 work_done = 0;
> > -	u32 irqstatus;
> > +	u32 irqstatus, psr;
> >  
> >  	irqstatus = m_can_read(priv, M_CAN_IR);
> >  	if (irqstatus)
> > @@ -337,6 +526,48 @@ static int m_can_poll(struct napi_struct *napi, int quota)
> >  	if (!irqstatus)
> >  		goto end;
> >  
> > +	psr = m_can_read(priv, M_CAN_PSR);
> > +	if (irqstatus & IR_ERR_STATE) {
> > +		if ((psr & PSR_EW) &&
> > +			(priv->can.state != CAN_STATE_ERROR_WARNING)) {
> > +			netdev_dbg(dev, "entered error warning state\n");
> > +			work_done += m_can_handle_state_change(dev,
> > +					CAN_STATE_ERROR_WARNING);
> > +		}
> > +
> > +		if ((psr & PSR_EP) &&
> > +			(priv->can.state != CAN_STATE_ERROR_PASSIVE)) {
> > +			netdev_dbg(dev, "entered error warning state\n");
> > +			work_done += m_can_handle_state_change(dev,
> > +					CAN_STATE_ERROR_PASSIVE);
> > +		}
> > +
> > +		if ((psr & PSR_BO) &&
> > +			(priv->can.state != CAN_STATE_BUS_OFF)) {
> > +			netdev_dbg(dev, "entered error warning state\n");
> > +			work_done += m_can_handle_state_change(dev,
> > +					CAN_STATE_BUS_OFF);
> > +		}
> 
> You might want to push this into a seperate function.
> 

Yes, could do like that.

> > +	}
> > +
> > +	if (irqstatus & IR_ERR_BUS) {
> > +		if (irqstatus & IR_RF0L)
> > +			work_done += m_can_handle_lost_msg(dev);
> > +
> > +		/* handle lec errors on the bus */
> > +		if (psr & LEC_UNUSED)
> > +			work_done += m_can_handle_bus_err(dev,
> > +					psr & LEC_UNUSED);
> > +
> > +		/* other unproccessed error interrupts */
> > +		if (irqstatus & IR_WDI)
> > +			netdev_err(dev, "Message RAM Watchdog event due to missing READY\n");
> > +		if (irqstatus & IR_TOO)
> > +			netdev_err(dev, "Timeout reached\n");
> > +		if (irqstatus & IR_MRAF)
> > +			netdev_err(dev, "Message RAM access failure occurred\n");
> 
> same here

Got it.

> > +	}
> > +
> >  	if (irqstatus & IR_RF0N)
> >  		/* handle events corresponding to receive message objects */
> >  		work_done += m_can_do_rx_poll(dev, (quota - work_done));
> > @@ -369,31 +600,18 @@ static irqreturn_t m_can_isr(int irq, void *dev_id)
> >  	if (ir & IR_ALL_INT)
> >  		m_can_write(priv, M_CAN_IR, ir);
> >  
> > -	if (ir & IR_ERR_ALL) {
> > -		netdev_dbg(dev, "bus error\n");
> > -		/* TODO: handle bus error */
> > -	}
> > -
> > -	/* save irqstatus for later using */
> > -	priv->irqstatus = ir;
> > -
> >  	/*
> >  	 * schedule NAPI in case of
> >  	 * - rx IRQ
> > -	 * - state change IRQ(TODO)
> > -	 * - bus error IRQ and bus error reporting (TODO)
> > +	 * - state change IRQ
> > +	 * - bus error IRQ and bus error reporting
> >  	 */
> > -	if (ir & IR_RF0N) {
> > +	if ((ir & IR_RF0N) || (ir & IR_ERR_ALL)) {
> > +		priv->irqstatus = ir;
> >  		m_can_enable_all_interrupts(priv, false);
> >  		napi_schedule(&priv->napi);
> >  	}
> >  
> > -	/* FIFO overflow */
> > -	if (ir & IR_RF0L) {
> > -		dev->stats.rx_over_errors++;
> > -		dev->stats.rx_errors++;
> > -	}
> > -
> >  	/* transmission complete interrupt */
> >  	if (ir & IR_TC) {
> >  		netdev_dbg(dev, "tx complete\n");
> > @@ -446,7 +664,6 @@ static int m_can_set_bittiming(struct net_device *dev)
> >   * - setup bittiming
> >   * - TODO:
> >   *   1) other working modes support like monitor, loopback...
> > - *   2) lec error status report enable
> >   */
> >  static void m_can_chip_config(struct net_device *dev)
> >  {
> > @@ -515,14 +732,6 @@ static int m_can_set_mode(struct net_device *dev, enum can_mode mode)
> >  	return 0;
> >  }
> >  
> > -static int m_can_get_berr_counter(const struct net_device *dev,
> > -				  struct can_berr_counter *bec)
> > -{
> > -	/* TODO */
> > -
> > -	return 0;
> > -}
> > -
> >  static void free_m_can_dev(struct net_device *dev)
> >  {
> >  	free_candev(dev);
> > 
> 
> Marc
> 

Thanks

Regards
Dong Aisheng

> -- 
> Pengutronix e.K.                  | Marc Kleine-Budde           |
> Industrial Linux Solutions        | Phone: +49-231-2826-924     |
> Vertretung West/Dortmund          | Fax:   +49-5121-206917-5555 |
> Amtsgericht Hildesheim, HRA 2686  | http://www.pengutronix.de   |
> 

--
To unsubscribe from this list: send the line "unsubscribe devicetree" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html