[Android-virt] [PATCH 05/15] ARM: KVM: VGIC virtual CPU interface management

c.dall at virtualopensystems.com (Christoffer Dall) · Thu, 21 Jun 2012 23:00:18 -0400

On Mon, May 14, 2012 at 9:05 AM, Marc Zyngier <marc.zyngier at arm.com> wrote:
> Add VGIC virtual CPU interface code, picking pending interrupts
> from the distributor and stashing them in the VGIC control interface
> list registers.
>
> Signed-off-by: Marc Zyngier <marc.zyngier at arm.com>
> ---
> ?arch/arm/include/asm/kvm_vgic.h | ? 30 +++++
> ?arch/arm/kvm/vgic.c ? ? ? ? ? ? | ?230 ++++++++++++++++++++++++++++++++++++++-
> ?2 files changed, 259 insertions(+), 1 deletions(-)
>
> diff --git a/arch/arm/include/asm/kvm_vgic.h b/arch/arm/include/asm/kvm_vgic.h
> index dca2d09..939f3f8 100644
> --- a/arch/arm/include/asm/kvm_vgic.h
> +++ b/arch/arm/include/asm/kvm_vgic.h
> @@ -106,14 +106,44 @@ struct vgic_dist {
> ?};
>
> ?struct vgic_cpu {
> +#ifdef CONFIG_KVM_ARM_VGIC
> + ? ? ? spinlock_t ? ? ?lock;
> +
> + ? ? ? u8 ? ? ? ? ? ? ?vgic_irq_lr_map[VGIC_NR_IRQS]; ?/* per IRQ to LR mapping */
> + ? ? ? u8 ? ? ? ? ? ? ?vgic_lr_irq_map[64]; ? ? ? ? ? ?/* per LR to IRQ mapping */

if someone changed the VGIC_NR_IRQS to higher than 256, you're in trouble here.

> + ? ? ? DECLARE_BITMAP( pending, VGIC_NR_IRQS);
> +
> + ? ? ? int ? ? ? ? ? ? nr_lr;
> +
> + ? ? ? /* CPU vif control registers for world switch */
> + ? ? ? u32 ? ? ? ? ? ? vgic_hcr;
> + ? ? ? u32 ? ? ? ? ? ? vgic_mcr;

why not vgic_vmcr to fit with documentation?

> + ? ? ? u32 ? ? ? ? ? ? vgic_misr; ? ? ?/* Saved only */
> + ? ? ? u32 ? ? ? ? ? ? vgic_elsr[2]; ? /* Saved only */
> + ? ? ? u32 ? ? ? ? ? ? vgic_apr;
> + ? ? ? u32 ? ? ? ? ? ? vgic_lr[64]; ? ?/* Silly, A15 has only 4... */
> +#endif
> ?};
>
> +#define VGIC_HCR_EN ? ? ? ? ? ?(1 << 0)
> +#define VGIC_HCR_UIE ? ? ? ? ? (1 << 1)
> +
> +#define VGIC_LR_PHYSID_CPUID ? (7 << 10)
> +#define VGIC_LR_STATE ? ? ? ? ?(3 << 28)
> +#define VGIC_LR_PENDING_BIT ? ?(1 << 28)
> +#define VGIC_LR_ACTIVE_BIT ? ? (1 << 29)
> +
> ?struct kvm;
> ?struct kvm_vcpu;
> ?struct kvm_run;
>
> ?#ifdef CONFIG_KVM_ARM_VGIC
> +void kvm_vgic_sync_to_cpu(struct kvm_vcpu *vcpu);
> +void kvm_vgic_sync_from_cpu(struct kvm_vcpu *vcpu);
> +int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu);
> ?int vgic_handle_mmio(struct kvm_vcpu *vcpu, struct kvm_run *run);
> +
> +#define irqchip_in_kernel(k) ? (!!((k)->arch.vgic.vctrl_base))
> ?#else
> ?static inline int kvm_vgic_hyp_init(void)
> ?{
> diff --git a/arch/arm/kvm/vgic.c b/arch/arm/kvm/vgic.c
> index 1ace859..ccd8b69 100644
> --- a/arch/arm/kvm/vgic.c
> +++ b/arch/arm/kvm/vgic.c
> @@ -416,12 +416,35 @@ static void vgic_dispatch_sgi(struct kvm_vcpu *vcpu, u32 reg)
>
> ?static int compute_pending_for_cpu(struct kvm_vcpu *vcpu)
> ?{
> - ? ? ? return 0;
> + ? ? ? struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
> + ? ? ? unsigned long *pending, *enabled, *pend;
> + ? ? ? int vcpu_id;
> +
> + ? ? ? vcpu_id = vcpu->vcpu_id;
> + ? ? ? pend = vcpu->arch.vgic_cpu.pending;

why are we doing this work on the vgic structure? couldn't it just be
a local variable here? why is this pending value from last call
preserved?

> +
> + ? ? ? pending = vgic_bitmap_get_cpu_map(&dist->irq_pending, vcpu_id);
> + ? ? ? enabled = vgic_bitmap_get_cpu_map(&dist->irq_enabled, vcpu_id);
> + ? ? ? bitmap_and(pend, pending, enabled, 32);
> +
> + ? ? ? pending = dist->irq_pending.global.reg_ul;
> + ? ? ? enabled = dist->irq_enabled.global.reg_ul;
> + ? ? ? bitmap_and(pend + 1, pending, enabled, VGIC_NR_IRQS - 32);
> + ? ? ? bitmap_and(pend + 1, pend + 1, dist->irq_spi_target[vcpu_id].global.reg_ul,
> + ? ? ? ? ? ? ? ? ?VGIC_NR_IRQS - 32);
> +

so I think I prefer calling a function that creates the spi-target
bitmap when needed here and not have multiple copies of the same
state, but there might be a performance reason for doing this?

also, let me see if I understand the overall scheme here:

 - at any time, the dist->irq_pending_on_cpu is the oracle that knows if
   something is pending
 - vgic pending interrupts are stored on the vgic.irq_pending vgic
   bitmap (this bitmap is updated by both user land ioctls and guest
   mmio ops)
 - every time the bitmap changes, the irq_pending_on_cpu oracle is
   recalculated
 - to calculate the oracle, we need info for each cpu from
   compute_pending_for_cpu, which considers:
   - PPI: dist->irq_pending & dist->irq_enable
   - SPI: dist->irq_pending & dist->irq_enable & dist->irq_spi_target
   - irq_spi_target is in essence a differently 'formatted' copy of
     irq_target, stored for each vcpu

Maybe we can try to document something like this here in some overall
high-level view somewhere. My returning concern here is the somewhat
duplicated content (irq_spi_target) and the missing feeling of a
single entry point that calculates the needed pending bits in a clear
maner, but I don't see a clear picture of improvement - at least not
yet.

> + ? ? ? return (find_first_bit(pend, VGIC_NR_IRQS) < VGIC_NR_IRQS);
> ?}
>
> ?/*
> ?* Update the interrupt state and determine which CPUs have pending
> ?* interrupts. Must be called with distributor lock held.
> + *
> + * It would be very tempting to just compute the pending bitmap once,
> + * but that would make it quite ugly locking wise when a vcpu actually
> + * moves the interrupt to its list registers (think of a single
> + * interrupt pending on several vcpus). So we end up computing the
> + * pending list twice (once here, and once in __kvm_vgic_sync_to_cpu).

I don't understand this comment (at least not yet). Why are we
calculating it here if we're calculating it in a different way, but
for the same, in another function. I guess I will try and find out.

> ?*/
> ?static void vgic_update_state(struct kvm *kvm)
> ?{
> @@ -443,3 +466,208 @@ static void vgic_update_state(struct kvm *kvm)
> ? ? ? ? ? ? ? ?}
> ? ? ? ?}
> ?}
> +
> +/*
> + * Queue an interrupt to a CPU virtual interface. Return 0 on success,
> + * or 1 if it wasn't possible to queue it.
> + */
> +static int kvm_gic_queue_irq(struct kvm_vcpu *vcpu, u8 cpuid, int irq)

why is this all of the sudden kvm_gic and not vgic_? Is it moving to
generic code?

nit: rename cpuid -> sgi_cpuid / sgi_source_id / ...

> +{
> + ? ? ? struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
> + ? ? ? int lr = vgic_cpu->vgic_irq_lr_map[irq];
> +
> + ? ? ? pr_debug("Queue IRQ%d\n", irq);

nit: kvm_debug

> +
> + ? ? ? /* Sanitize cpuid... */
> + ? ? ? cpuid &= 7;

BUG_ON(cpuid & ~7); ?
BUG_ON(cpuid && irq > 15); ?
BUG_ON(irq >= VGIC_NR_IRQS); ?

> +
> + ? ? ? /* Do we have an active interrupt for the same CPUID? */
> + ? ? ? if (lr != 0xff &&

0xff ? magic number?
#define LR_EMPTY (0xff) ?

> + ? ? ? ? ? (vgic_cpu->vgic_lr[lr] & VGIC_LR_PHYSID_CPUID) == (cpuid << 10)) {
> + ? ? ? ? ? ? ? pr_debug("LR%d piggyback for IRQ%d %x\n", lr, irq, cpuid);
> + ? ? ? ? ? ? ? vgic_cpu->vgic_lr[lr] |= VGIC_LR_PENDING_BIT;
> + ? ? ? ? ? ? ? return 0;
> + ? ? ? }
> +
> + ? ? ? /* Try to use another LR for this interrupt */
> + ? ? ? lr = find_first_bit((unsigned long *)vgic_cpu->vgic_elsr,
> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?vgic_cpu->nr_lr);
> + ? ? ? if (lr >= vgic_cpu->nr_lr)
> + ? ? ? ? ? ? ? return 1;
> +
> + ? ? ? pr_debug("LR%d allocated for IRQ%d %x\n", lr, irq, cpuid);
> + ? ? ? vgic_cpu->vgic_lr[lr] = (VGIC_LR_PENDING_BIT | ?(cpuid << 10) | irq);

stray extra space

> + ? ? ? vgic_cpu->vgic_irq_lr_map[irq] = lr;
> + ? ? ? vgic_cpu->vgic_lr_irq_map[lr] = irq;

do we need this state or can we just read the VirtualID field of the LR here?

> + ? ? ? clear_bit(lr, (unsigned long *)vgic_cpu->vgic_elsr);
> +
> + ? ? ? return 0;
> +}
> +
> +/*
> + * Fill the list registers with pending interrupts before running the
> + * guest.
> + */
> +static void __kvm_vgic_sync_to_cpu(struct kvm_vcpu *vcpu)
> +{
> + ? ? ? struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
> + ? ? ? struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
> + ? ? ? unsigned long *pending;
> + ? ? ? int i, c, vcpu_id;
> + ? ? ? int overflow = 0;
> +
> + ? ? ? vcpu_id = vcpu->vcpu_id;
> +
> + ? ? ? /*
> + ? ? ? ?* We may not have any pending interrupt, or the interrupts
> + ? ? ? ?* may have been serviced from another vcpu. In all cases,
> + ? ? ? ?* move along.
> + ? ? ? ?*/
> + ? ? ? if (!kvm_vgic_vcpu_pending_irq(vcpu) ||
> + ? ? ? ? ? !compute_pending_for_cpu(vcpu)) {

I still don't see why there's a precomputed value here that we cannot
trust, so we recalculate it here (can we ever trust the precomputed
one?)

oh, but we can trust the precomputed value if there are no interrupts,
so we only have to recalculate if there are pending interrupts - what
is the explanation for this || here?

forgive my lack of understanding, but there's a lot to take in here.

> + ? ? ? ? ? ? ? pr_debug("CPU%d has no pending interrupt\n", vcpu->vcpu_id);
> + ? ? ? ? ? ? ? goto epilog;
> + ? ? ? }
> +
> + ? ? ? /* SGIs */
> + ? ? ? pending = vgic_bitmap_get_cpu_map(&dist->irq_pending, vcpu_id);
> + ? ? ? for (i = find_first_bit(vgic_cpu->pending, 16);
> + ? ? ? ? ? ?i < 16;
> + ? ? ? ? ? ?i = find_next_bit(vgic_cpu->pending, 16, i + 1)) {

consider:

i = -1;
while ((i = find_next_bit(vgic_cpu->pending, 16, i + 1)) < 16) {
        ...
}

or

for (i = -1; i < 16; i = find_next_bit(vgic_cpu->pending, 16, i + 1)) {
        ...
}

or (perhaps the most readable, considering the nested usage, could be
added to generic code)

#define for_each_set_bit(pos, map, start, end) \
        for (....)

> + ? ? ? ? ? ? ? unsigned long sources;
> +
> + ? ? ? ? ? ? ? pr_debug("SGI%d on CPU%d\n", i, vcpu_id);

nit: kvm_debug

> + ? ? ? ? ? ? ? sources = dist->irq_sgi_sources[vcpu_id][i];
> + ? ? ? ? ? ? ? for (c = find_first_bit(&sources, 8);
> + ? ? ? ? ? ? ? ? ? ?c < 8;
> + ? ? ? ? ? ? ? ? ? ?c = find_next_bit(&sources, 8, c + 1)) {
> + ? ? ? ? ? ? ? ? ? ? ? if (kvm_gic_queue_irq(vcpu, c, i)) {
> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? overflow = 1;
> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? continue;
> + ? ? ? ? ? ? ? ? ? ? ? }
> +
> + ? ? ? ? ? ? ? ? ? ? ? sources &= ~(1 << c);
> + ? ? ? ? ? ? ? }
> +
> + ? ? ? ? ? ? ? if (!sources)
> + ? ? ? ? ? ? ? ? ? ? ? clear_bit(i, pending);
> +
> + ? ? ? ? ? ? ? dist->irq_sgi_sources[vcpu_id][i] = sources;

you could make sources a pointer and get rid of this line...

> + ? ? ? }
> +
> + ? ? ? /* PPIs */
> + ? ? ? for (i = find_next_bit(vgic_cpu->pending, 32, 16);
> + ? ? ? ? ? ?i < 32;
> + ? ? ? ? ? ?i = find_next_bit(vgic_cpu->pending, 32, i + 1)) {
> + ? ? ? ? ? ? ? if (kvm_gic_queue_irq(vcpu, 0, i)) {
> + ? ? ? ? ? ? ? ? ? ? ? overflow = 1;
> + ? ? ? ? ? ? ? ? ? ? ? continue;
> + ? ? ? ? ? ? ? }
> +
> + ? ? ? ? ? ? ? clear_bit(i, pending);

so the reason we're clearing this bit here, is that we're 'moving' the
status from the extra pending list to the list registers, right?

could the whole approach be different, so we write directly into the
hardware list registers and only spill over into "extra" registers if
we don't have enough list registers, or perhaps deny the interrupt
injection and let user land store this state? (just thinking out crazy
thoughts here, as this is the time to do that I guess).

> + ? ? ? }
> +
> +
> + ? ? ? /* SPIs */
> + ? ? ? pending = dist->irq_pending.global.reg_ul;
> + ? ? ? for (i = find_next_bit(vgic_cpu->pending, VGIC_NR_IRQS, 32);
> + ? ? ? ? ? ?i < VGIC_NR_IRQS;
> + ? ? ? ? ? ?i = find_next_bit(vgic_cpu->pending, VGIC_NR_IRQS, i + 1)) {
> + ? ? ? ? ? ? ? if (kvm_gic_queue_irq(vcpu, 0, i)) {
> + ? ? ? ? ? ? ? ? ? ? ? overflow = 1;
> + ? ? ? ? ? ? ? ? ? ? ? continue;
> + ? ? ? ? ? ? ? }
> +
> + ? ? ? ? ? ? ? clear_bit(i - 32, pending);
> + ? ? ? }
> +
> +epilog:
> + ? ? ? if (overflow)
> + ? ? ? ? ? ? ? vgic_cpu->vgic_hcr |= VGIC_HCR_UIE;
> + ? ? ? else {
> + ? ? ? ? ? ? ? vgic_cpu->vgic_hcr &= ~VGIC_HCR_UIE;
> + ? ? ? ? ? ? ? /*
> + ? ? ? ? ? ? ? ?* We're about to run this VCPU, and we've consumed
> + ? ? ? ? ? ? ? ?* everything the distributor had in store for
> + ? ? ? ? ? ? ? ?* us. Claim we don't have anything pending. We'll
> + ? ? ? ? ? ? ? ?* adjust that if needed while exiting.
> + ? ? ? ? ? ? ? ?*/
> + ? ? ? ? ? ? ? atomic_clear_mask(1 << vcpu_id,
> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? (unsigned long *)&dist->irq_pending_on_cpu);
> + ? ? ? }
> +}
> +
> +/*
> + * Sync back the VGIC state after a guest run.
> + */
> +static void __kvm_vgic_sync_from_cpu(struct kvm_vcpu *vcpu)
> +{
> + ? ? ? struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
> + ? ? ? struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
> + ? ? ? int empty, pending;
> +
> + ? ? ? /* Clear mappings for empty LRs */
> + ? ? ? empty = find_first_bit((unsigned long *)vgic_cpu->vgic_elsr,
> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?vgic_cpu->nr_lr);
> + ? ? ? while (empty < vgic_cpu->nr_lr) {

another nice use case for the for_each_set_bit(...)

> + ? ? ? ? ? ? ? int lr = empty;
> + ? ? ? ? ? ? ? int irq = vgic_cpu->vgic_lr_irq_map[lr];
> +
> + ? ? ? ? ? ? ? if (irq < VGIC_NR_IRQS)
> + ? ? ? ? ? ? ? ? ? ? ? vgic_cpu->vgic_irq_lr_map[irq] = 0xff;

 = LR_EMPTY ?

> +
> + ? ? ? ? ? ? ? vgic_cpu->vgic_lr_irq_map[lr] = VGIC_NR_IRQS;

s/VGIC_NR_IRQS/LR_EMPTY/  ?

> + ? ? ? ? ? ? ? empty = find_next_bit((unsigned long *)vgic_cpu->vgic_elsr,
> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? vgic_cpu->nr_lr, empty + 1);
> + ? ? ? }
> +
> + ? ? ? /* Check if we still have something up our sleeve... */
> + ? ? ? pending = find_first_zero_bit((unsigned long *)vgic_cpu->vgic_elsr,
> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? vgic_cpu->nr_lr);
> + ? ? ? if (pending < vgic_cpu->nr_lr)
> + ? ? ? ? ? ? ? atomic_or(1 << vcpu->vcpu_id, &dist->irq_pending_on_cpu);

ah, so you here you make sure that something still pending causes
vcpu_runnable to be true, got it.

> +}
> +
> +void kvm_vgic_sync_to_cpu(struct kvm_vcpu *vcpu)
> +{
> + ? ? ? struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
> + ? ? ? struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
> +
> + ? ? ? if (!irqchip_in_kernel(vcpu->kvm))
> + ? ? ? ? ? ? ? return;
> +
> + ? ? ? spin_lock(&dist->lock);
> + ? ? ? spin_lock(&vgic_cpu->lock);
> + ? ? ? __kvm_vgic_sync_to_cpu(vcpu);
> + ? ? ? spin_unlock(&vgic_cpu->lock);
> + ? ? ? spin_unlock(&dist->lock);

Two spin_locks for every guest entry/exit. Can we not avoid this
somehow? (so much work so far to have a lock-less entry in the common
case).

> +
> + ? ? ? *__this_cpu_ptr(vgic_vcpus) = vcpu;
> +}
> +
> +void kvm_vgic_sync_from_cpu(struct kvm_vcpu *vcpu)
> +{
> + ? ? ? struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
> + ? ? ? struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
> +
> + ? ? ? if (!irqchip_in_kernel(vcpu->kvm))
> + ? ? ? ? ? ? ? return;
> +
> + ? ? ? spin_lock(&dist->lock);
> + ? ? ? spin_lock(&vgic_cpu->lock);
> + ? ? ? __kvm_vgic_sync_from_cpu(vcpu);
> + ? ? ? spin_unlock(&vgic_cpu->lock);
> + ? ? ? spin_unlock(&dist->lock);
> +
> + ? ? ? *__this_cpu_ptr(vgic_vcpus) = NULL;
> +}
> +
> +int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu)
> +{
> + ? ? ? struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
> +
> + ? ? ? if (!irqchip_in_kernel(vcpu->kvm))
> + ? ? ? ? ? ? ? return 0;
> +
> + ? ? ? return !!(atomic_read(&dist->irq_pending_on_cpu) & (1 << vcpu->vcpu_id));

if this gets changed to test_bit, you should creep just below the 81
characters width here ;)

> +}
> --
> 1.7.7.1
>
>
>
> _______________________________________________
> Android-virt mailing list
> Android-virt at lists.cs.columbia.edu
> https://lists.cs.columbia.edu/cucslists/listinfo/android-virt