Hi,

2009/9/29 Azraiyl <azraiyl@xxxxxxxxx>:
>> introduces inside the system. Look at the CPU-load this is
>> generating... High latencies would not surprise me since it is likely
>> that the processor just cannot keep up...
>
> With some simple work inside the loop, the max. CPU usage is 19%.
> Normally it's ca. 15%.

I would expect the load to be much higher... Do you have a proper
sched_clock() implementation, or the default fallback-to-jiffies-based
sched_clock()? The load statistics might be misleading...

See the attachment for a preliminary patch (I am using it on a sam9261
core).

>> Better use the TC-library to generate a dedicated interrupt if you
>> want some realtime responsiveness and somewhat reasonable CPU-load.
>> Personally I would not go beyond the 1 kHz boundary with this
>> processor...
>
> I'll try this. Thanks for the hint.
>
> Anyway, I'm still worried about these worst-case latencies and would
> like to know where they come from.

Is the ftrace infrastructure supposed to work on ARM? I used it for
tracing sched_switches on 2.6.31, and it works there. I have not tested
it on 2.6.29 myself, though.

Remy
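P.S. For reference, the generic fallback sched_clock() in the mainline
kernel has only jiffies resolution. If I remember correctly it looks
roughly like this (kernel/sched_clock.c):

/*
 * Default weak implementation: architectures without their own
 * sched_clock() get 1/HZ (jiffies) granularity only.
 */
unsigned long long __attribute__((weak)) sched_clock(void)
{
	return (unsigned long long)(jiffies - INITIAL_JIFFIES)
					* (NSEC_PER_SEC / HZ);
}

With only 1/HZ granularity, short-running tasks are easily accounted as
zero runtime, which is why the load figures look far too low.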
Add sched_clock() to the AT91 TC clocksource driver

Without this patch, tools like 'top' will display a far too low CPU load.
On AT91 there is no architecture-specific sched_clock() implementation,
so the default fallback is used. This fallback uses the jiffies counter
as sched_clock().

On AT91 there is no standard clocksource available that is accurate
enough, except the TC-based clocksource implementation. Therefore that
clocksource is used as the base for sched_clock(). It offers
sub-microsecond timestamping (< 200 ns).

Signed-off-by: Remy Bohmer <linux@xxxxxxxxxx>
---
 drivers/clocksource/tcb_clksrc.c |   68 ++++++++++++++++++++++++++++++++++-----
 1 file changed, 61 insertions(+), 7 deletions(-)

Index: linux-2.6.31/drivers/clocksource/tcb_clksrc.c
===================================================================
--- linux-2.6.31.orig/drivers/clocksource/tcb_clksrc.c	2009-09-29 23:03:21.000000000 +0200
+++ linux-2.6.31/drivers/clocksource/tcb_clksrc.c	2009-09-29 23:09:19.000000000 +0200
@@ -11,6 +11,7 @@
 #include <linux/platform_device.h>
 #include <linux/atmel_tc.h>
+#include <linux/sched.h>	/* for sched_clock() prototype */
 
 /*
  * We're configured to use a specific TC block, one that's not hooked
@@ -38,19 +39,22 @@
  */
 
 static void __iomem *tcaddr;
+static int clocksource_initialised;
+static unsigned long long nsecs_per_clock;
+static DEFINE_ATOMIC_SPINLOCK(sched_clock_lock);
 
 static cycle_t tc_get_cycles(struct clocksource *cs)
 {
 	unsigned long	flags;
 	u32		lower, upper;
 
-	raw_local_irq_save(flags);
+	atomic_spin_lock_irqsave(&sched_clock_lock, flags);
 	do {
 		upper = __raw_readl(tcaddr + ATMEL_TC_REG(1, CV));
 		lower = __raw_readl(tcaddr + ATMEL_TC_REG(0, CV));
 	} while (upper != __raw_readl(tcaddr + ATMEL_TC_REG(1, CV)));
 
-	raw_local_irq_restore(flags);
+	atomic_spin_unlock_irqrestore(&sched_clock_lock, flags);
 	return (upper << 16) | lower;
 }
 
@@ -63,6 +67,53 @@ static struct clocksource clksrc = {
 	.flags		= CLOCK_SOURCE_IS_CONTINUOUS,
 };
 
+/* Override the default sched_clock() implementation */
+unsigned long long sched_clock(void)
+{
+	unsigned long flags;
+	unsigned long long cycles;
+	u32 upper32, lower32;
+	u32 cycles32;
+	static u32 prev_cycles32;
+	static unsigned long long upper64;
+
+	if (clocksource_initialised) {
+		/* Extend the 32-bit cycle count to 64 bits. We assume
+		 * we are called more often than once every 5.726 minutes
+		 * (this time is derived from a 12.5 MHz clock rate). */
+		atomic_spin_lock_irqsave(&sched_clock_lock, flags);
+
+		do {
+			upper32 = __raw_readl(tcaddr + ATMEL_TC_REG(1, CV));
+			lower32 = __raw_readl(tcaddr + ATMEL_TC_REG(0, CV));
+			/* Make sure the low counter does not wrap while
+			 * reading the time */
+		} while (upper32 != __raw_readl(tcaddr + ATMEL_TC_REG(1, CV)));
+
+		cycles32 = (upper32 << 16) | lower32;
+
+		if (cycles32 < prev_cycles32) {
+			/* Wrap-around detected, or a backwards jump in time
+			 * of the lower 16-bit counter? Ignore the latter.
+			 * REVISIT: Unfortunately we have seen such jumps.
+			 * Without this check you will see the printk clock
+			 * make huge jumps forward in time during boot.
+			 */
+			if ((prev_cycles32 - cycles32) > (1 << 16))
+				upper64 += 1LLU << 32;	/* A full wrap-around */
+		}
+		prev_cycles32 = cycles32;
+
+		cycles = upper64 | (unsigned long long)cycles32;
+
+		atomic_spin_unlock_irqrestore(&sched_clock_lock, flags);
+
+		cycles *= nsecs_per_clock;
+		return cycles;
+	} else {
+		return 0;
+	}
+}
+
 #ifdef CONFIG_GENERIC_CLOCKEVENTS
 
 struct tc_clkevt_device {
@@ -212,9 +263,6 @@ static void __init setup_clkevents(struc
 
 static int __init tcb_clksrc_init(void)
 {
-	static char bootinfo[] __initdata
-		= KERN_DEBUG "%s: tc%d at %d.%03d MHz\n";
-
 	struct platform_device *pdev;
 	struct atmel_tc *tc;
 	struct clk *t0_clk;
@@ -258,9 +306,11 @@ static int __init tcb_clksrc_init(void)
 
 	clksrc.mult = clocksource_hz2mult(divided_rate, clksrc.shift);
 
-	printk(bootinfo, clksrc.name, CONFIG_ATMEL_TCB_CLKSRC_BLOCK,
+	printk(KERN_DEBUG "%s: tc%d at %d.%03d MHz\n",
+			clksrc.name, CONFIG_ATMEL_TCB_CLKSRC_BLOCK,
 			divided_rate / 1000000,
-			((divided_rate + 500000) % 1000000) / 1000);
+			(divided_rate -
+			((divided_rate / 1000000) * 1000000)) / 1000);
 
 	/* tclib will give us three clocks no matter what the
 	 * underlying platform supports.
@@ -297,6 +347,10 @@ static int __init tcb_clksrc_init(void)
 	/* channel 2: periodic and oneshot timer support */
 	setup_clkevents(tc, clk32k_divisor_idx);
 
+	/* Calculate the time per clock tick, needed for sched_clock() */
+	nsecs_per_clock = (1000 * 1000 * 1000) / divided_rate;
+	clocksource_initialised = 1;
+
 	return 0;
 }
 arch_initcall(tcb_clksrc_init);
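For what it is worth, a quick sanity check on the numbers used above,
assuming the 12.5 MHz divided rate mentioned in the code comment:

	nsecs_per_clock = 1000000000 / 12500000 = 80 ns per tick
	32-bit wrap     = 2^32 * 80 ns ~= 343.6 s ~= 5.73 minutes

So the resolution stays well below the 200 ns mentioned in the
changelog, and sched_clock() must indeed be called at least once every
~5.7 minutes for the wrap detection to work.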