Re: [kvm-unit-tests PATCH] arm64: add micro test

Shih-Wei Li <shihwei@xxxxxxxxxxxxxxx> · Mon, 18 Dec 2017 16:32:24 -0500

On Mon, Dec 18, 2017 at 12:31 PM, Yury Norov <ynorov@xxxxxxxxxxxxxxxxxx> wrote:
> Hi Shih-Wei,
>
> (CC Cavium folks)
>
> Thank you for the test! I was going to write something like
> this, and instead found your patch submitted - good luck to me.
>
> Soon I'll test my hardware with it and share you results.
> Comments inline.
>
> Yury
>
> On Fri, Dec 15, 2017 at 04:15:39PM -0500, Shih-Wei Li wrote:
>> Here we provide the support for measuring various micro level
>> operations on arm64. We iterate each of the tests for millions of
>> times and output their average, minimum and maximum cost in timer
>> counts. Instruction barriers are used before and after taking
>> timestamps to avoid out-of-order execution or pipelining from
>> skewing our measurements.
>>
>> The tests we currently support and measure are mostly
>> straightforward by the function names and the respective comments.
>> For IPI test, we measure the cost of sending IPI from a source
>> VCPU to a target VCPU, until the target VCPU receives the IPI.
>>
>> Signed-off-by: Shih-Wei Li <shihwei@xxxxxxxxxxxxxxx>
>> ---
>>  arm/Makefile.common |   1 +
>>  arm/micro-test.c    | 289 ++++++++++++++++++++++++++++++++++++++++++++++++++++
>>  arm/unittests.cfg   |   6 ++
>>  3 files changed, 296 insertions(+)
>>  create mode 100644 arm/micro-test.c
>>
>> diff --git a/arm/Makefile.common b/arm/Makefile.common
>> index 0a039cf..c7d5c27 100644
>> --- a/arm/Makefile.common
>> +++ b/arm/Makefile.common
>> @@ -16,6 +16,7 @@ tests-common += $(TEST_DIR)/pmu.flat
>>  tests-common += $(TEST_DIR)/gic.flat
>>  tests-common += $(TEST_DIR)/psci.flat
>>  tests-common += $(TEST_DIR)/sieve.flat
>> +tests-common += $(TEST_DIR)/micro-test.flat
>>
>>  tests-all = $(tests-common) $(tests)
>>  all: directories $(tests-all)
>> diff --git a/arm/micro-test.c b/arm/micro-test.c
>> new file mode 100644
>> index 0000000..7df2272
>> --- /dev/null
>> +++ b/arm/micro-test.c
>> @@ -0,0 +1,289 @@
>> +#include <util.h>
>> +#include <asm/gic.h>
>> +
>> +static volatile bool second_cpu_up;
>> +static volatile bool first_cpu_ack;
>> +static volatile bool ipi_acked;
>> +static volatile bool ipi_received;
>> +static volatile bool ipi_ready;
>> +#define IPI_IRQ              1
>> +
>> +#define TIMEOUT (1U << 28)
>> +
>> +#define ARR_SIZE(_x) ((int)(sizeof(_x) / sizeof(_x[0])))
>> +#define for_each_test(_iter, _tests, _tmp) \
>> +     for (_tmp = 0, _iter = _tests; \
>> +                     _tmp < ARR_SIZE(_tests); \
>> +                     _tmp++, _iter++)
>> +
>> +#define CYCLE_COUNT(c1, c2) \
>> +     (((c1) > (c2) || ((c1) == (c2))) ? 0 : (c2) - (c1))
>
> Is my understanding correct that this is overflow protection?
> c1 and c2 are 64-bit values. To overflow them you need 58 years
> at 1G CPU freq.
>
> And anyway,
> #define CYCLE_COUNT(c1, c2)  ((c1) >= (c2) ? 0 : (c2) - (c1))

Yes, it's for overflow protection.

>
>> +
>> +#define IPI_DEBUG 0
>> +
>
> If you assume to pass IPI_DEBUG as GCC parameter, like -DIPI_DEBUG=1,
> it should be
>
> #ifndef IPI_DEBUG
> #define IPI_DEBUG 0
> #endif
>
> But I wonder if there's already existing switch for debug info?
>
>> +#if IPI_DEBUG == 1
>> +#define debug(fmt, ...) \
>> +     printf("[cpu %d]: " fmt, smp_processor_id(),  ## __VA_ARGS__)
>> +#else
>> +#define debug(fmt, ...) {}
>> +#endif
>
> There are some fancy printing functions in lib/report.c. I didn't
> look deeply into thet, but at first glance you can use it here.
>
>> +
>> +static uint64_t read_cc(void)
>> +{
>> +     uint64_t cc;
>> +     asm volatile(
>> +             "isb\n"
>> +             "mrs %0, CNTPCT_EL0\n"
>> +             "isb\n"
>> +             : [reg] "=r" (cc)
>> +             ::
>> +     );
>> +     return cc;
>> +}
>
> Flushing pipeline before mrs is enough. Also, you can use read_sysreg()
> instead of inline assembly. Refer arch_counter_get_cntpct() in kernel:
>
> 151 static inline u64 arch_counter_get_cntpct(void)
> 152 {
> 153         isb();
> 154         return arch_timer_reg_read_stable(cntpct_el0);
> 155 }
>
>> +
>> +static void ipi_irq_handler(struct pt_regs *regs __unused)
>> +{
>> +     u32 ack;
>> +     ipi_ready = false;
>> +     ipi_received = true;
>> +     ack = gic_read_iar();
>> +     ipi_acked = true;
>> +     gic_write_eoir(ack);
>> +     ipi_ready = true;
>> +}
>> +
>> +static void ipi_test_secondary_entry(void)
>> +{
>> +     unsigned int timeout = TIMEOUT;
>> +
>> +     debug("secondary core up\n");
>> +
>> +     enum vector v = EL1H_IRQ;
>> +     install_irq_handler(v, ipi_irq_handler);
>> +
>> +     gic_enable_defaults();
>> +
>> +     second_cpu_up = true;
>> +
>> +     debug("secondary initialized vgic\n");
>> +
>> +     while (!first_cpu_ack && timeout--);
>> +     if (!first_cpu_ack) {
>> +             debug("ipi_test: First CPU did not ack wake-up\n");
>> +             exit(1);
>> +     }
>
> Nit: here and later, timeout is not actually timeout, but something like
> counter of attempts. Maybe it worth to use read_cc() here if you need
> time intervals?
>
>> +
>> +     debug("detected first cpu ack\n");
>> +
>> +     local_irq_enable(); /* Enter small wait-loop */
>> +     ipi_ready = true;
>> +     while (true);
>> +}
>> +
>> +static int test_init(void)
>> +{
>> +     int ret;
>> +     unsigned int timeout = TIMEOUT;
>> +
>> +     ret = gic_init();
>> +     if (!ret) {
>> +             debug("No supported gic present, skipping tests...\n");
>> +             goto out;
>> +     }
>> +
>> +     ipi_ready = false;
>> +
>> +     gic_enable_defaults();
>> +
>> +     debug("starting second CPU\n");
>> +     smp_boot_secondary(1, ipi_test_secondary_entry);
>> +
>> +     while (!second_cpu_up && timeout--); /* Wait for second CPU! */
>> +
>> +     if (!second_cpu_up) {
>> +             debug("ipi_test: timeout waiting for secondary CPU\n");
>> +             ret = 0;
>> +             goto out;
>> +     }
>> +
>> +     debug("detected secondary core up\n");
>> +
>> +     first_cpu_ack = true;
>> +
>> +     printf("Timer Frequency %d Hz (Output in timer count)\n", get_cntfrq());
>> +
>> +out:
>
> Nit: it's simpler to return error at place and drop the 'out' label,
> like this:
> if (!gic_init()) {
>         debug(...);
>         return 0;
> }

Yes.

>
>> +     return ret;
>> +}
>> +
>> +static unsigned long ipi_test(void)
>> +{
>> +     unsigned int timeout = TIMEOUT;
>> +     unsigned long c1, c2;
>
> read_cc() returns uint64_t, so c1 and c2 should also be fixed-size
> variables. Maybe one day we'll port test to arm32 where unsigned long is
> 32-bit...

Right, I'll look at it.

>
>> +
>> +     while (!ipi_ready && timeout--);
>> +     if (!ipi_ready) {
>> +             debug("ipi_test: second core not ready for IPIs\n");
>> +             exit(1);
>> +     }
>> +
>> +     ipi_received = false;
>> +
>> +     c1 = read_cc();
>> +
>> +     gic_ipi_send_single(IPI_IRQ, 1);
>> +
>> +     timeout = TIMEOUT;
>> +     while (!ipi_received && timeout--);
>> +     if (!ipi_received) {
>> +             debug("ipi_test: secondary core never received ipi\n");
>> +             exit(1);
>> +     }
>> +
>> +     c2 = read_cc();
>> +     return CYCLE_COUNT(c1, c2);
>> +}
>> +
>> +
>
> Nit: odd line.
>
>> +static unsigned long hvc_test(void)
>> +{
>> +     unsigned long c1, c2;
>> +
>> +     c1 = read_cc();
>> +     asm volatile("mov w0, #0x4b000000; hvc #0");
>> +     c2 = read_cc();
>> +     return CYCLE_COUNT(c1, c2);
>> +}
>
> This will be broken if compiler decide to assign r0 for variable c1,
> or I miss something?
>
>> +
>> +static void __noop(void)
>> +{
>> +}
>
> This would be completely optimized out. Is it for demonstrative
> purpose?

Yes, this is to show the cost using isb().

>
>> +
>> +static unsigned long noop_guest(void)
>> +{
>> +     unsigned long c1, c2;
>> +
>> +     c1 = read_cc();
>> +     __noop();
>> +     c2 = read_cc();
>> +     return CYCLE_COUNT(c1, c2);
>> +}
>> +
>> +static unsigned long mmio_read_user(void)
>> +{
>> +     unsigned long c1, c2;
>> +     void *mmio_read_user_addr = (void*) 0x0a000008;
>> +
>> +     /* Measure MMIO exit to QEMU in userspace */
>> +     c1 = read_cc();
>> +     readl(mmio_read_user_addr);
>> +     c2 = read_cc();
>> +     return CYCLE_COUNT(c1, c2);
>> +}
>> +
>> +static unsigned long mmio_read_vgic(void)
>> +{
>> +     unsigned long c1, c2;
>> +     int v = gic_version();
>> +     void *vgic_dist_addr = NULL;
>> +
>> +     if (v == 2)
>> +             vgic_dist_addr = gicv2_dist_base();
>> +     else if (v == 3)
>> +             vgic_dist_addr = gicv3_dist_base();
>> +
>> +     /* Measure MMIO exit to host kernel */
>> +     c1 = read_cc();
>> +     readl(vgic_dist_addr + 0x8); /* Read GICD_IIDR */
>> +     c2 = read_cc();
>> +     return CYCLE_COUNT(c1, c2);
>> +}
>
> So if gic version is not 2 or 3, it will readl() at broken
> address. I think it's better to return error and print relevant
> message to system log.
>
>> +
>> +static unsigned long eoi_test(void)
>> +{
>> +     unsigned long c1, c2;
>> +     int v = gic_version();
>> +     void (*write_eoir)(u32 irqstat) = NULL;
>> +
>> +     u32 val = 1023; /* spurious IDs, writes to EOI are ignored */
>> +
>> +     if (v == 2)
>> +             write_eoir = gicv2_write_eoir;
>> +     else if (v == 3)
>> +             write_eoir = gicv3_write_eoir;
>> +
>> +     c1 = read_cc();
>> +     write_eoir(val);
>> +     c2 = read_cc();
>> +
>> +     return CYCLE_COUNT(c1, c2);
>> +}
>
> Similar here.
>
>> +
>> +struct exit_test {
>> +     const char *name;
>> +     unsigned long (*test_fn)(void);
>> +     bool run;
>> +};
>> +
>> +static struct exit_test available_tests[] = {
>> +     {"hvc",                hvc_test,           true},
>> +     {"noop_guest",         noop_guest,         true},
>> +     {"mmio_read_user",     mmio_read_user,     true},
>> +     {"mmio_read_vgic",     mmio_read_vgic,     true},
>> +     {"eoi",                eoi_test,           true},
>> +     {"ipi",                ipi_test,           true},
>> +};
>> +
>> +static void loop_test(struct exit_test *test)
>> +{
>> +     unsigned long i, iterations = 32;
>> +     unsigned long sample, cycles;
>> +     unsigned long long min = 0, max = 0;
>> +     const unsigned long long goal = (1ULL << 29);
>> +
>> +     do {
>> +             iterations *= 2;
>> +             cycles = 0;
>> +             for (i = 0; i < iterations; i++) {
>> +                     sample = test->test_fn();
>> +                     if (sample == 0) {
>> +                             /*
>> +                              * If something went wrong or we had an
>> +                              * overflow, don't count that sample.
>> +                              */
>> +                             iterations--;
>> +                             i--;
>> +                             debug("cycle count overflow: %lu\n", sample);
>> +                             continue;
>> +                     }
>> +                     cycles += sample;
>> +                     if (min == 0 || min > sample)
>> +                             min = sample;
>> +                     if (max < sample)
>> +                             max = sample;
>> +             }
>> +     } while (cycles < goal);
>> +     printf("%s:\t avg %lu\t min %llu\t max %llu\n",
>> +             test->name, cycles / (iterations), min, max);
>> +}
>> +
>> +void kvm_unit_test(void)
>> +{
>> +     int i=0;
>> +     struct exit_test *test;
>> +     for_each_test(test, available_tests, i) {
>> +             if (!test->run)
>> +                     continue;
>> +             loop_test(test);
>> +     }
>> +
>> +     return;
>> +}
>> +
>> +int main(int argc, char **argv)
>> +{
>> +     if (!test_init())
>> +             exit(1);
>> +     kvm_unit_test();
>> +     return 0;
>> +}
>> diff --git a/arm/unittests.cfg b/arm/unittests.cfg
>> index 44b98cf..1d0c4ca 100644
>> --- a/arm/unittests.cfg
>> +++ b/arm/unittests.cfg
>> @@ -116,3 +116,9 @@ file = timer.flat
>>  groups = timer
>>  timeout = 2s
>>  arch = arm64
>> +
>> +# Exit tests
>> +[micro-test]
>> +file = micro-test.flat
>> +smp = 2
>> +groups = micro-test
>> --
>> 1.9.1
>>