Am 08.11.18 um 08:06 schrieb Lukas Wunner: > When in DMA mode, the BCM2835 SPI controller requires that the FIFO is > accessed in 4 byte chunks. This rule is not fulfilled if a transfer > consists of multiple sglist entries, one per page, and the first entry > starts in the middle of a page with an offset not a multiple of 4. > > The driver currently falls back to programmed I/O for such transfers, > incurring a significant performance penalty. > > Overcome this hardware limitation by transferring the first few bytes of > a transfer without DMA such that the remainder of the first sglist entry > becomes a multiple of 4. Specifics are provided in kerneldoc comments. > > An alternative approach would have been to split transfers in the > ->prepare_message hook, but this may necessitate two transfers per page, > defeating the goal of clustering multiple pages together in a single > transfer for efficiency. E.g. if the first TX sglist entry's length is > 23 and the first RX's is 40, the first transfer would send and receive > 23 bytes, the second 40 - 23 = 17 bytes, the third 4096 - 17 = 4079 > bytes, the fourth 4096 - 4079 = 17 bytes and so on. In other words, > O(n) transfers are necessary (n = number of sglist entries), whereas > the algorithm implemented herein only requires O(1) additional work. 
> > Signed-off-by: Lukas Wunner <lukas@xxxxxxxxx> > Cc: Mathias Duckeck <m.duckeck@xxxxxxxxx> > Cc: Frank Pavlic <f.pavlic@xxxxxxxxx> > Cc: Martin Sperl <kernel@xxxxxxxxxxxxxxxx> > Cc: Noralf Trønnes <noralf@xxxxxxxxxxx> > --- > drivers/spi/spi-bcm2835.c | 291 +++++++++++++++++++++++++++++++------- > 1 file changed, 242 insertions(+), 49 deletions(-) > > diff --git a/drivers/spi/spi-bcm2835.c b/drivers/spi/spi-bcm2835.c > index 9b9b9926a956..36719d2cc12d 100644 > --- a/drivers/spi/spi-bcm2835.c > +++ b/drivers/spi/spi-bcm2835.c > @@ -85,20 +85,30 @@ > * @regs: base address of register map > * @clk: core clock, divided to calculate serial clock > * @irq: interrupt, signals TX FIFO empty or RX FIFO ¾ full > + * @tfr: SPI transfer currently processed > * @tx_buf: pointer whence next transmitted byte is read > * @rx_buf: pointer where next received byte is written > * @tx_len: remaining bytes to transmit > * @rx_len: remaining bytes to receive > + * @tx_prologue: bytes transmitted without DMA if first TX sglist entry's > + * length is not a multiple of 4 (to overcome hardware limitation) > + * @rx_prologue: bytes received without DMA if first RX sglist entry's > + * length is not a multiple of 4 (to overcome hardware limitation) > + * @tx_spillover: whether @tx_prologue spills over to second TX sglist entry > * @dma_pending: whether a DMA transfer is in progress > */ > struct bcm2835_spi { > void __iomem *regs; > struct clk *clk; > int irq; > + struct spi_transfer *tfr; > const u8 *tx_buf; > u8 *rx_buf; > int tx_len; > int rx_len; > + int tx_prologue; > + int rx_prologue; > + bool tx_spillover; > bool dma_pending; > }; > > @@ -137,6 +147,72 @@ static inline void bcm2835_wr_fifo(struct bcm2835_spi *bs) > } > } > > +/** > + * bcm2835_rd_fifo_count() - blindly read exactly @count bytes from RX FIFO > + * @bs: BCM2835 SPI controller > + * @count: bytes to read from RX FIFO > + * > + * The caller must ensure that @bs->rx_len is greater than or equal to @count, > + * that 
the RX FIFO contains at least @count bytes and that the DMA Enable flag > + * in the CS register is set (such that a read from the FIFO register receives > + * 32-bit instead of just 8-bit). > + */ > +static inline void bcm2835_rd_fifo_count(struct bcm2835_spi *bs, int count) > +{ > + u32 val; > + > + bs->rx_len -= count; > + > + while (count > 0) { > + val = bcm2835_rd(bs, BCM2835_SPI_FIFO); > + if (bs->rx_buf) { > + int len = min(count, 4); > + memcpy(bs->rx_buf, &val, len); > + bs->rx_buf += len; > + } > + count -= 4; > + } > +} > + > +/** > + * bcm2835_wr_fifo_count() - blindly write exactly @count bytes to TX FIFO > + * @bs: BCM2835 SPI controller > + * @count: bytes to write to TX FIFO > + * > + * The caller must ensure that @bs->tx_len is greater than or equal to @count, > + * that the TX FIFO can accommodate @count bytes and that the DMA Enable flag > + * in the CS register is set (such that a write to the FIFO register transmits > + * 32-bit instead of just 8-bit). > + */ > +static inline void bcm2835_wr_fifo_count(struct bcm2835_spi *bs, int count) > +{ > + u32 val; > + > + bs->tx_len -= count; > + > + while (count > 0) { > + if (bs->tx_buf) { > + int len = min(count, 4); > + memcpy(&val, bs->tx_buf, len); > + bs->tx_buf += len; > + } else { > + val = 0; > + } > + bcm2835_wr(bs, BCM2835_SPI_FIFO, val); > + count -= 4; > + } > +} > + > +/** > + * bcm2835_wait_tx_fifo_empty() - busy-wait for TX FIFO to empty > + * @bs: BCM2835 SPI controller > + */ > +static inline void bcm2835_wait_tx_fifo_empty(struct bcm2835_spi *bs) > +{ > + while (!(bcm2835_rd(bs, BCM2835_SPI_CS) & BCM2835_SPI_CS_DONE)) > + cpu_relax(); > +} Can we have some kind of timeout here, so that we never spin forever in case the hardware doesn't behave as expected?