[Android-virt] [PATCH 04/15] ARM: KVM: VGIC distributor handling

marc.zyngier at arm.com (Marc Zyngier) · Thu, 21 Jun 2012 15:20:40 +0100

On 21/06/12 00:02, Christoffer Dall wrote:
> On Mon, May 14, 2012 at 9:05 AM, Marc Zyngier <marc.zyngier at arm.com> wrote:
>> Add the GIC distributor emulation code. A number of the GIC features
>> are simply ignored as they are not required to boot a Linux guest.
>>
>> Signed-off-by: Marc Zyngier <marc.zyngier at arm.com>
>> ---
>>  arch/arm/include/asm/kvm_vgic.h |  101 +++++++++++++
>>  arch/arm/kvm/vgic.c             |  315 +++++++++++++++++++++++++++++++++++++++
>>  2 files changed, 416 insertions(+), 0 deletions(-)
>>
>> diff --git a/arch/arm/include/asm/kvm_vgic.h b/arch/arm/include/asm/kvm_vgic.h
>> index 34f5cc0..dca2d09 100644
>> --- a/arch/arm/include/asm/kvm_vgic.h
>> +++ b/arch/arm/include/asm/kvm_vgic.h
>> @@ -1,7 +1,108 @@
>>  #ifndef __ASM_ARM_KVM_VGIC_H
>>  #define __ASM_ARM_KVM_VGIC_H
>>
>> +#include <linux/kernel.h>
>> +#include <linux/kvm.h>
>> +#include <linux/irqreturn.h>
>> +#include <linux/spinlock.h>
>> +#include <linux/types.h>
>> +
>> +#define VGIC_NR_IRQS   128           /* Arbitrary number */
> 
> perhaps mention that the struct definitions break if this is not a power of two.
> 
> perhaps also mention (or check like below) that the number of maximum
> IRQs are 1024 (even though max must value is 1019 for non-reserved.

OK.

> 
>> +#define VGIC_MAX_CPUS  KVM_MAX_VCPUS /* Same as the HW GIC */
>> +
>> +#if (VGIC_MAX_CPUS > 8)
>> +#error Invalid number of CPU interfaces
>> +#endif
>> +
>> +/*
>> + * The GIC registers describing interrupts have two parts:
> 
> these are specifically GIC distributor registers right?

Yes.

>> + * - 32 per-CPU interrupts (SGI + PPI)
>> + * - a bunch of global interrups (SPI)
>> + * They can have 1, 2 or 8 bit fields. Make it easier by having some template
>> + * to create the structures and the accessors.
>> + */
>> +#define DEFINE_VGIC_MAP_STRUCT(typename, size)                           \
>> +struct typename {                                                        \
>> +       union {                                                           \
>> +               u32 reg[32 / (sizeof(u32) * 8 / size)];                   \
> 
> 32 / (sizeof(u32) * 8 / size) == size
> 
> this should be pretty much no matter the architecture, compiler, etc. No?

Duh! Nice one.

> 
>> +               unsigned long reg_ul[0];                                  \
>> +       } percpu[VGIC_MAX_CPUS];                                          \
>> +       union {                                                           \
>> +               u32 reg[(VGIC_NR_IRQS - 32) / (sizeof(u32) * 8 / size)];  \
> 
> consider defining VGIC_NR_GLOB_IRQS (VGIC_NR_IRQS - 32)
> and do:
> 
> u32 reg[(VGIC_N_GLOB_IRQS * size) / sizeof(u32)];

OK.

>> +               unsigned long reg_ul[0];                                  \
>> +       } global;                                                         \
>> +};                                                                       \
>> +static inline u32 *typename##_get_reg(struct typename *x,                \
>> +                                     int cpuid, u32 offset)              \
>> +{                                                                        \
>> +       static const int irq_per_u32 = sizeof(u32) * 8 / size;            \
>> +       static const int glob_offset = 32 / irq_per_u32;                  \
>> +       offset >>= 2;                                                     \
>> +       BUG_ON(offset > (VGIC_NR_IRQS  / irq_per_u32));                   \
>> +       if (offset < glob_offset)                                         \
>> +               return x->percpu[cpuid].reg + offset;                     \
>> +       else                                                              \
>> +               return x->global.reg + offset - glob_offset;              \
>> +}                                                                        \
>> +static inline int typename##_get_irq_val(struct typename *x,             \
>> +                                        int cpuid, int irq)              \
>> +{                                                                        \
>> +       static const int irq_per_u32 = sizeof(u32) * 8 / size;            \
>> +       static const u32 mask = (1 << size) - 1;                          \
>> +       u32 *reg, offset, shift;                                          \
>> +       offset = (irq / irq_per_u32) << 2;                                \
>> +       shift = (irq % irq_per_u32) * size;                               \
>> +       reg = typename##_get_reg(x, cpuid, offset);                       \
>> +       return (*reg >> shift) & mask;                                    \
>> +}                                                                        \
>> +static inline void typename##_set_irq_val(struct typename *x,            \
>> +                                        int cpuid, int irq, int val)     \
>> +{                                                                        \
>> +       static const int irq_per_u32 = sizeof(u32) * 8 / size;            \
>> +       static const u32 mask = (1 << size) - 1;                          \
>> +       u32 *reg, offset, shift;                                          \
>> +       offset = (irq / irq_per_u32) << 2;                                \
>> +       shift = (irq % irq_per_u32) * size;                               \
>> +       reg = typename##_get_reg(x, cpuid, offset);                       \
>> +       *reg &= ~(mask << shift);                                         \
>> +       *reg |= (val & mask) << shift;                                    \
>> +}                                                                        \
>> +static inline unsigned long *typename##_get_cpu_map(struct typename *x,          \
>> +                                                   int cpu_id)           \
>> +{                                                                        \
>> +       if (unlikely(cpu_id >= VGIC_MAX_CPUS))                            \
>> +               return NULL;                                              \
>> +       return x->percpu[cpu_id].reg_ul;                                  \
>> +}
> 
> this macro makes me want to cry a little :)

[snip]

OK, point taken. I'll have a look at reworking the macro mess, but will
focus on the rest of the review for the time being.

>>  struct vgic_dist {
>> +#ifdef CONFIG_KVM_ARM_VGIC
>> +       spinlock_t              lock;
>> +
>> +       void __iomem            *vctrl_base;
>> +
>> +       u32                     enabled;
>> +
>> +       struct vgic_bitmap      irq_enabled;
>> +       struct vgic_bitmap      irq_pending;
>> +       struct vgic_bitmap      irq_active; /* Not used yet. Useful? */
> 
> are these not only used to preserve the GIC state for
> migration/powerloss and therefore not relevant when already held in
> main memory? If set to pending, figuring out if the interrupt is
> active, I guess we can just read the List registers, right?

We could indeed remove it, and just ignores the writes to this registers.

>> +
>> +       struct vgic_bytemap     irq_priority;/* Not used yet. Useful? */
> 
> if we don't use these fields and have no way of testing them, I
> suggest removing them and if we need to remember that they could go
> here, there could be a comment on the structure itself.

It could be used to restore a KVM-based VM to a QEMU-based system. If we
don't plan to support this, I'll just nuke the field.

> 
>> +       struct vgic_bytemap     irq_target;
>> +
>> +       struct vgic_2bitmap     irq_cfg; /* Not used yet. Useful? */
> 
> if it's implementation defined if this is supported for any PPI then
> it seems unlikely we will ever have guests running that relies on the
> fact that some PPIs can be configured to be edge-triggered, Linux uses
> level-triggered interrupts for everything it can program anyway, iirc.
> 
> if we take this out we can save all the 2bitmap stuff above as well right?

Yes, same as above.

>> +
>> +       u8                      irq_sgi_sources[VGIC_MAX_CPUS][16];
>> +
>> +       struct vgic_bitmap      irq_spi_target[VGIC_MAX_CPUS];
> 
> why do you need a per-cpu view of something banked per-cpu?

We're loosing a bit of memory here (one u32 per CPU). Not really a big
deal, but could be optimized if necessary.

>> +
>> +       atomic_t                irq_pending_on_cpu;
>> +#endif
>>  };
>>
>>  struct vgic_cpu {
>> diff --git a/arch/arm/kvm/vgic.c b/arch/arm/kvm/vgic.c
>> index f7856a9..1ace859 100644
>> --- a/arch/arm/kvm/vgic.c
>> +++ b/arch/arm/kvm/vgic.c
>> @@ -21,6 +21,10 @@
>>  #include <linux/interrupt.h>
>>  #include <linux/io.h>
>>
>> +/* Temporary hacks, need to probe DT instead */
>> +#define VGIC_DIST_BASE         0x2c001000
>> +#define VGIC_DIST_SIZE         0x1000
>> +
> 
> what's involved in fixing this now? I assume this should come from
> userspace who may read it off the supplied DT or?

It is still something we have to work out with Peter. Probably involve
playing with PERIPHBASE instead of the DT. Will have to come from
userspace anyway.

>>  #define ACCESS_READ_VALUE      (1 << 0)
>>  #define ACCESS_READ_RAZ                (0 << 0)
>>  #define ACCESS_READ_MASK(x)    ((x) & (1 << 0))
>> @@ -30,6 +34,9 @@
>>  #define ACCESS_WRITE_VALUE     (3 << 1)
>>  #define ACCESS_WRITE_MASK(x)   ((x) & (3 << 1))
>>
>> +static void vgic_update_state(struct kvm *kvm);
>> +static void vgic_dispatch_sgi(struct kvm_vcpu *vcpu, u32 reg);
>> +
>>  static void mmio_do_copy(struct kvm_run *run, u32 *reg, u32 offset, int mode)
>>  {
>>        int u32off = offset & 3;
>> @@ -83,6 +90,182 @@ static void mmio_do_copy(struct kvm_run *run, u32 *reg, u32 offset, int mode)
>>        }
>>  }
>>
>> +static void handle_mmio_misc(struct kvm_vcpu *vcpu,
>> +                            struct kvm_run *run, u32 offset)
>> +{
>> +       u32 reg;
>> +       u32 u32off = offset & 3;
>> +
>> +       switch (offset & ~3) {
>> +       case 0:                 /* CTLR */
>> +               reg = vcpu->kvm->arch.vgic.enabled;
>> +               mmio_do_copy(run, &reg, u32off,
>> +                            ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
>> +               if (run->mmio.is_write) {
>> +                       vcpu->kvm->arch.vgic.enabled = reg & 1;
>> +                       vgic_update_state(vcpu->kvm);
>> +               }
>> +               break;
>> +
>> +       case 4:                 /* TYPER */
>> +               reg  = (atomic_read(&vcpu->kvm->online_vcpus) - 1) << 5;
>> +               reg |= (VGIC_NR_IRQS >> 5) - 1;
>> +               mmio_do_copy(run, &reg, u32off,
>> +                            ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
>> +               break;
>> +
>> +       case 8:                 /* IIDR */
>> +               reg = 0x4B00043B;
> 
> shouldn't this be keyed off something or is this fixed for the GICv2
> something ARM specific?

0x4B is 'K'. 0x43B is the code for ARM. I read it as KVM/ARM, but that's
only me...

>> +               mmio_do_copy(run, &reg, u32off,
>> +                            ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
>> +               break;
>> +       }
>> +}
>> +
>> +static void handle_mmio_group_reg(struct kvm_vcpu *vcpu,
>> +                                 struct kvm_run *run, u32 offset)
>> +{
>> +       mmio_do_copy(run, NULL, offset,
>> +                    ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
>> +}
>> +
>> +static void handle_mmio_set_enable_reg(struct kvm_vcpu *vcpu,
>> +                                      struct kvm_run *run, u32 offset)
>> +{
>> +       u32 *reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_enabled,
>> +                                      vcpu->vcpu_id, offset);
>> +       mmio_do_copy(run, reg, offset,
>> +                    ACCESS_READ_VALUE | ACCESS_WRITE_SETBIT);
>> +       if (run->mmio.is_write)
>> +               vgic_update_state(vcpu->kvm);
>> +}
>> +
>> +static void handle_mmio_clear_enable_reg(struct kvm_vcpu *vcpu,
>> +                                        struct kvm_run *run, u32 offset)
>> +{
>> +       u32 *reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_enabled,
>> +                                      vcpu->vcpu_id, offset);
>> +       mmio_do_copy(run, reg, offset,
>> +                    ACCESS_READ_VALUE | ACCESS_WRITE_CLEARBIT);
>> +       if (run->mmio.is_write && offset < 4) /* Force SGI enabled */
>> +               *reg |= 0xffff;
> 
> why no vgic_update_state(vcpu->kvm) here?

Because I always like to leave tiny little silly bugs behind me... ;-)
Thanks for pointing this out.

>> +}
>> +
>> +static void handle_mmio_set_pending_reg(struct kvm_vcpu *vcpu,
>> +                                       struct kvm_run *run, u32 offset)
>> +{
>> +       u32 *reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_pending,
>> +                                      vcpu->vcpu_id, offset);
>> +       mmio_do_copy(run, reg, offset,
>> +                    ACCESS_READ_VALUE | ACCESS_WRITE_SETBIT);
>> +       if (run->mmio.is_write)
>> +               vgic_update_state(vcpu->kvm);
>> +}
>> +
>> +static void handle_mmio_clear_pending_reg(struct kvm_vcpu *vcpu,
>> +                                         struct kvm_run *run, u32 offset)
>> +{
>> +       u32 *reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_pending,
>> +                                      vcpu->vcpu_id, offset);
>> +       mmio_do_copy(run, reg, offset,
>> +                    ACCESS_READ_VALUE | ACCESS_WRITE_CLEARBIT);
> 
> why no vgic_update_state(vcpu->kvm) here?

For the same reason as above, and for the sake of cut-n-paste programming.

>> +}
>> +
>> +static void handle_mmio_set_active_reg(struct kvm_vcpu *vcpu,
>> +                                      struct kvm_run *run, u32 offset)
>> +{
>> +       u32 *reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_active,
>> +                                      vcpu->vcpu_id, offset);
>> +       mmio_do_copy(run, reg, offset,
>> +                    ACCESS_READ_VALUE | ACCESS_WRITE_SETBIT);
>> +}
>> +
>> +static void handle_mmio_clear_active_reg(struct kvm_vcpu *vcpu,
>> +                                        struct kvm_run *run, u32 offset)
>> +{
>> +       u32 *reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_active,
>> +                                      vcpu->vcpu_id, offset);
>> +       mmio_do_copy(run, reg, offset,
>> +                    ACCESS_READ_VALUE | ACCESS_WRITE_CLEARBIT);
>> +}
> 
> if these active register read/writes should be here, shouldn't they
> talk to the List registers (or whatever we use to back these values -
> haven't read that part yet)?

As agreed above, I'll convert these to WI/RAZ.

>> +
>> +static void handle_mmio_priority_reg(struct kvm_vcpu *vcpu,
>> +                                    struct kvm_run *run, u32 offset)
>> +{
>> +       u32 *reg = vgic_bytemap_get_reg(&vcpu->kvm->arch.vgic.irq_priority,
>> +                                       vcpu->vcpu_id, offset);
>> +       mmio_do_copy(run, reg, offset,
>> +                    ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
> 
> shouldn't this call vgic_update_state(vcpu->kvm) to check the running
> priority of the virtual cpu interface?

No, we do not take priority into account to deliver an interrupt, as
Linux doesn't use this. Can be introduced later if necessary.

>> +}
>> +
>> +static void update_spi_target(struct kvm *kvm)
>> +{
>> +       struct vgic_dist *dist = &kvm->arch.vgic;
>> +       int c, i, nrcpus = atomic_read(&kvm->online_vcpus);
>> +       u8 targ;
>> +       unsigned long *bmap;
>> +
>> +       for (i = 32; i < VGIC_NR_IRQS; i++) {
>> +               targ = vgic_bytemap_get_irq_val(&dist->irq_target, 0, i);
>> +
> 
> ah, I see, cpu=0 is because we're always using i >= 32.

Yeah, it's an ugly hack.

>> +               for (c = 0; c < nrcpus; c++) {
>> +                       bmap = dist->irq_spi_target[c].global.reg_ul;
>> +
>> +                       if (targ & (1 << c))
>> +                               set_bit(i - 32, bmap);
>> +                       else
>> +                               clear_bit(i - 32, bmap);
>> +               }
>> +       }
>> +}
> 
> I'm reading this as an optimization so you can easily determine if a
> specific VCPU should get the SPI? Or am I getting this entirely wrong?

It is a lot more than just an optimization. We really need to know which
CPU is getting targeted by a specific interrupt.

> At what time is this performance critical operation performed?

Only when writing to the target registers, which is hardly a performance
critical thing.

> Perhaps a comment would be very helpful. This makes me wonder, doesn't
> Linux just configure cpu 0 to take all physical interrupts or did I
> glance over this code too soon? Do we have any reason for this
> complexity for VMs at this point or could we just pretend that ll
> physical SPIs were statically assigned to CPU0? Performance issues?

You can change the affinity of an interrupt at any time. The fact that
Linux routes everything on CPU0 by default is just a default behavior. A
short example while running an SMP VM:

root at linaro:~# grep eth0 /proc/interrupts
 47:         20          0       GIC  eth0
root at linaro:~# cat /proc/irq/47/smp_affinity
3
root at linaro:~# echo 2 > /proc/irq/47/smp_affinity
root at linaro:~# ping 10.0.2.2
PING 10.0.2.2 (10.0.2.2) 56(84) bytes of data.
64 bytes from 10.0.2.2: icmp_req=1 ttl=255 time=5.96 ms
64 bytes from 10.0.2.2: icmp_req=2 ttl=255 time=0.834 ms
64 bytes from 10.0.2.2: icmp_req=3 ttl=255 time=0.818 ms
^C
--- 10.0.2.2 ping statistics ---
3 packets transmitted, 3 received, 0% packet loss, time 2002ms
rtt min/avg/max/mdev = 0.818/2.540/5.968/2.423 ms
root at linaro:~# grep eth0 /proc/interrupts
 47:         20          3       GIC  eth0

>> +
>> +static void handle_mmio_target_reg(struct kvm_vcpu *vcpu,
>> +                                  struct kvm_run *run, u32 offset)
>> +{
>> +       u32 *reg;
>> +
>> +       /* We treat the banked interrupts targets as read-only */
>> +       if (offset < 32) {
> 
> why is this < 32 ? shouldn't it be < 8 ?

This is organized as one byte per interrupt. Each bit in the byte is
representing a CPU.

>> +               u32 roreg = 1 << vcpu->vcpu_id;
>> +               roreg |= roreg << 8;
>> +               roreg |= roreg << 16;
>> +
>> +               mmio_do_copy(run, &roreg, offset,
>> +                            ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
>> +               return;
>> +       }
>> +
>> +       reg = vgic_bytemap_get_reg(&vcpu->kvm->arch.vgic.irq_target,
>> +                                  vcpu->vcpu_id, offset);
>> +       mmio_do_copy(run, reg, offset,
>> +                    ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
>> +       if (run->mmio.is_write) {
>> +               update_spi_target(vcpu->kvm);
>> +               vgic_update_state(vcpu->kvm);
>> +       }
>> +}
>> +
>> +static void handle_mmio_cfg_reg(struct kvm_vcpu *vcpu,
>> +                               struct kvm_run *run, u32 offset)
>> +{
>> +       u32 *reg = vgic_2bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_cfg,
>> +                                       vcpu->vcpu_id, offset);
>> +       mmio_do_copy(run, reg, offset,
>> +                    ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
> 
> shouldn't we make sure that SGIs are RAO here?

Indeed.

> otherwise, could we just do RAZ/WI here?

Only if we discard the idea of migrating a VM to a non KVM environment.

>> +}
>> +
>> +static void handle_mmio_sgi_reg(struct kvm_vcpu *vcpu,
>> +                               struct kvm_run *run, u32 offset)
>> +{
>> +       u32 reg;
>> +       mmio_do_copy(run, &reg, offset,
>> +                    ACCESS_READ_RAZ | ACCESS_WRITE_VALUE);
>> +       if (run->mmio.is_write) {
>> +               vgic_dispatch_sgi(vcpu, reg);
>> +               vgic_update_state(vcpu->kvm);
>> +       }
>> +}
>> +
>>  /* All this should really be generic code... FIXME!!! */
>>  struct mmio_range {
>>        unsigned long base;
>> @@ -92,6 +275,66 @@ struct mmio_range {
>>  };
>>
>>  static const struct mmio_range vgic_ranges[] = {
>> +       {                       /* CTRL, TYPER, IIDR */
>> +               .base           = VGIC_DIST_BASE,
>> +               .len            = 12,
>> +               .handle_mmio    = handle_mmio_misc,
>> +       },
>> +       {                       /* IGROUPRn */
>> +               .base           = VGIC_DIST_BASE + 0x80,
>> +               .len            = VGIC_NR_IRQS / 8,
> 
> what happens if this is not byte aligned and the guest reads the
> entire register, then find_matching_range will not find this - is this
> a supported situation?

I'm not sure I understand what you mean here...

> 
>> +               .handle_mmio    = handle_mmio_group_reg,
>> +       },
>> +       {                       /* ISENABLERn */
>> +               .base           = VGIC_DIST_BASE + 0x100,
>> +               .len            = VGIC_NR_IRQS / 8,
>> +               .handle_mmio    = handle_mmio_set_enable_reg,
>> +       },
>> +       {                       /* ICENABLERn */
>> +               .base           = VGIC_DIST_BASE + 0x180,
>> +               .len            = VGIC_NR_IRQS / 8,
>> +               .handle_mmio    = handle_mmio_clear_enable_reg,
>> +       },
>> +       {                       /* ISPENDRn */
>> +               .base           = VGIC_DIST_BASE + 0x200,
>> +               .len            = VGIC_NR_IRQS / 8,
>> +               .handle_mmio    = handle_mmio_set_pending_reg,
>> +       },
>> +       {                       /* ICPENDRn */
>> +               .base           = VGIC_DIST_BASE + 0x280,
>> +               .len            = VGIC_NR_IRQS / 8,
>> +               .handle_mmio    = handle_mmio_clear_pending_reg,
>> +       },
>> +       {                       /* ISACTIVERn */
>> +               .base           = VGIC_DIST_BASE + 0x300,
>> +               .len            = VGIC_NR_IRQS / 8,
>> +               .handle_mmio    = handle_mmio_set_active_reg,
>> +       },
>> +       {                       /* ICACTIVERn */
>> +               .base           = VGIC_DIST_BASE + 0x380,
>> +               .len            = VGIC_NR_IRQS / 8,
>> +               .handle_mmio    = handle_mmio_clear_active_reg,
>> +       },
>> +       {                       /* IPRIORITYRn */
>> +               .base           = VGIC_DIST_BASE + 0x400,
>> +               .len            = VGIC_NR_IRQS,
>> +               .handle_mmio    = handle_mmio_priority_reg,
>> +       },
>> +       {                       /* ITARGETSRn */
>> +               .base           = VGIC_DIST_BASE + 0x800,
>> +               .len            = VGIC_NR_IRQS,
>> +               .handle_mmio    = handle_mmio_target_reg,
>> +       },
>> +       {                       /* ICFGRn */
>> +               .base           = VGIC_DIST_BASE + 0xC00,
>> +               .len            = VGIC_NR_IRQS / 4,
>> +               .handle_mmio    = handle_mmio_cfg_reg,
>> +       },
>> +       {                       /* SGIRn */
>> +               .base           = VGIC_DIST_BASE + 0xF00,
>> +               .len            = 4,
>> +               .handle_mmio    = handle_mmio_sgi_reg,
>> +       },
>>        {}
>>  };
>>
>> @@ -121,10 +364,82 @@ int vgic_handle_mmio(struct kvm_vcpu *vcpu, struct kvm_run *run)
>>        if (!range || !range->handle_mmio)
>>                return KVM_EXIT_MMIO;
>>
>> +       spin_lock(&vcpu->kvm->arch.vgic.lock);
>>        pr_debug("emulating %d %08llx %d\n", run->mmio.is_write,
>>                 run->mmio.phys_addr, run->mmio.len);
>>        range->handle_mmio(vcpu, run, run->mmio.phys_addr - range->base);
>>        kvm_handle_mmio_return(vcpu, run);
>> +       spin_unlock(&vcpu->kvm->arch.vgic.lock);
>>
>>        return KVM_EXIT_UNKNOWN;
>>  }
>> +
>> +static void vgic_dispatch_sgi(struct kvm_vcpu *vcpu, u32 reg)
>> +{
>> +       struct kvm *kvm = vcpu->kvm;
>> +       struct vgic_dist *dist = &kvm->arch.vgic;
>> +       int nrcpus = atomic_read(&kvm->online_vcpus);
>> +       u8 target_cpus;
>> +       int sgi, mode, c, vcpu_id;
>> +
>> +       vcpu_id = vcpu->vcpu_id;
>> +
>> +       sgi = reg & 0xf;
>> +       target_cpus = (reg >> 16) & 0xff;
> 
> no worries about the NSATT bit here? I'm a little fuzzy about the
> interrupt grouping and how that works with a guest always (or maybe
> not thinking so?) running in non-secure mode.

The notion of group only makes sense in secure mode. In non-secure mode,
you only see the interrupts that are flagged as group-1 by the secure
mode. These interrupts appear as group-0 on the non-secure side.

>> +       mode = (reg >> 24) & 3;
>> +
>> +       switch (mode) {
>> +       case 0:
>> +               if (!target_cpus)
>> +                       return;
>> +
>> +       case 1:
>> +               target_cpus = ((1 << nrcpus) - 1) & ~(1 << vcpu_id) & 0xff;
>> +               break;
>> +
>> +       case 2:
>> +               target_cpus = 1 << vcpu_id;
>> +               break;
>> +       }
>> +
>> +       for (c = 0; c < nrcpus; c++) {
>> +               if (target_cpus & 1) {
>> +                       /* Flag the SGI as pending */
>> +                       vgic_bitmap_set_irq_val(&dist->irq_pending, c, sgi, 1);
>> +                       dist->irq_sgi_sources[c][sgi] |= 1 << vcpu_id;
>> +                       pr_debug("SGI%d from CPU%d to CPU%d\n", sgi, vcpu_id, c);
> 
> should this not rather be a trace event?

Possibly. I haven't looked at the trace events yet. Any suggestion?

>> +               }
>> +
>> +               target_cpus >>= 1;
>> +       }
>> +}
>> +
>> +static int compute_pending_for_cpu(struct kvm_vcpu *vcpu)
>> +{
>> +       return 0;
>> +}
>> +
>> +/*
>> + * Update the interrupt state and determine which CPUs have pending
>> + * interrupts. Must be called with distributor lock held.
>> + */
>> +static void vgic_update_state(struct kvm *kvm)
>> +{
>> +       struct vgic_dist *dist = &kvm->arch.vgic;
>> +       int nrcpus = atomic_read(&kvm->online_vcpus);
>> +       int c;
>> +
>> +       if (!dist->enabled) {
>> +               atomic_set(&dist->irq_pending_on_cpu, 0);
>> +               return;
>> +       }
>> +
>> +       for (c = 0; c < nrcpus; c++) {
>> +               struct kvm_vcpu *vcpu = kvm_get_vcpu(kvm, c);
>> +
>> +               if (compute_pending_for_cpu(vcpu)) {
>> +                       pr_debug("CPU%d has pending interrupts\n", c);
>> +                       atomic_or((1 << c), &dist->irq_pending_on_cpu);
> 
> this may be simpler with set_bit(c, ....) and have pending_on_cpu be
> an unsigned long. not important.

Indeed.

Thanks for the very interesting review.

	M.
-- 
Jazz is not dead. It just smells funny...