On Sat, Nov 10, 2012 at 03:44:58PM +0000, Christoffer Dall wrote: > From: Marc Zyngier <marc.zyngier@xxxxxxx> > > Add the GIC distributor emulation code. A number of the GIC features > are simply ignored as they are not required to boot a Linux guest. > > Signed-off-by: Marc Zyngier <marc.zyngier@xxxxxxx> > Signed-off-by: Christoffer Dall <c.dall@xxxxxxxxxxxxxxxxxxxxxx> > --- > arch/arm/include/asm/kvm_vgic.h | 167 ++++++++++++++ > arch/arm/kvm/vgic.c | 471 +++++++++++++++++++++++++++++++++++++++ > 2 files changed, 637 insertions(+), 1 deletion(-) > > diff --git a/arch/arm/include/asm/kvm_vgic.h b/arch/arm/include/asm/kvm_vgic.h > index 9ca8d21..9e60b1d 100644 > --- a/arch/arm/include/asm/kvm_vgic.h > +++ b/arch/arm/include/asm/kvm_vgic.h > @@ -19,10 +19,177 @@ > #ifndef __ASM_ARM_KVM_VGIC_H > #define __ASM_ARM_KVM_VGIC_H > > +#include <linux/kernel.h> > +#include <linux/kvm.h> > +#include <linux/kvm_host.h> > +#include <linux/irqreturn.h> > +#include <linux/spinlock.h> > +#include <linux/types.h> > + > +#define VGIC_NR_IRQS 128 #define VGIC_NR_PRIVATE_IRQS 32? > +#define VGIC_NR_SHARED_IRQS (VGIC_NR_IRQS - 32) then subtract it here > +#define VGIC_MAX_CPUS NR_CPUS We already have KVM_MAX_VCPUS, why do we need another? > + > +/* Sanity checks... */ > +#if (VGIC_MAX_CPUS > 8) > +#error Invalid number of CPU interfaces > +#endif > + > +#if (VGIC_NR_IRQS & 31) > +#error "VGIC_NR_IRQS must be a multiple of 32" > +#endif > + > +#if (VGIC_NR_IRQS > 1024) > +#error "VGIC_NR_IRQS must be <= 1024" > +#endif Maybe put each check directly below the #define being tested, to make it super-obvious to people thinking of changing the constants? > +/* > + * The GIC distributor registers describing interrupts have two parts: > + * - 32 per-CPU interrupts (SGI + PPI) > + * - a bunch of shared interrups (SPI) interrupts > + */ > +struct vgic_bitmap { > + union { > + u32 reg[1]; > + unsigned long reg_ul[0]; > + } percpu[VGIC_MAX_CPUS]; > + union { > + u32 reg[VGIC_NR_SHARED_IRQS / 32]; > + unsigned long reg_ul[0]; > + } shared; > +}; Whoa, this is nasty! Firstly, let's replace the `32' with sizeof(u32) for fun. Secondly, can we make the reg_ul arrays sized using the BITS_TO_LONGS macro? > + > +static inline u32 *vgic_bitmap_get_reg(struct vgic_bitmap *x, > + int cpuid, u32 offset) > +{ > + offset >>= 2; > + BUG_ON(offset > (VGIC_NR_IRQS / 32)); Hmm, where is offset sanity-checked before here? Do you just rely on all trapped accesses being valid? > + if (!offset) > + return x->percpu[cpuid].reg; > + else > + return x->shared.reg + offset - 1; An alternative to this would be to have a single array, with the per-cpu interrupts all laid out at the start and a macro to convert an offset to an index. Might make the code more readable and the struct definition more concise. > +} > + > +static inline int vgic_bitmap_get_irq_val(struct vgic_bitmap *x, > + int cpuid, int irq) > +{ > + if (irq < 32) VGIC_NR_PRIVATE_IRQS (inless you go with the suggestion above) > + return test_bit(irq, x->percpu[cpuid].reg_ul); > + > + return test_bit(irq - 32, x->shared.reg_ul); > +} > + > +static inline void vgic_bitmap_set_irq_val(struct vgic_bitmap *x, > + int cpuid, int irq, int val) > +{ > + unsigned long *reg; > + > + if (irq < 32) > + reg = x->percpu[cpuid].reg_ul; > + else { > + reg = x->shared.reg_ul; > + irq -= 32; > + } Likewise. > + > + if (val) > + set_bit(irq, reg); > + else > + clear_bit(irq, reg); > +} > + > +static inline unsigned long *vgic_bitmap_get_cpu_map(struct vgic_bitmap *x, > + int cpuid) > +{ > + if (unlikely(cpuid >= VGIC_MAX_CPUS)) > + return NULL; > + return x->percpu[cpuid].reg_ul; > +} > + > +static inline unsigned long *vgic_bitmap_get_shared_map(struct vgic_bitmap *x) > +{ > + return x->shared.reg_ul; > +} > + > +struct vgic_bytemap { > + union { > + u32 reg[8]; > + unsigned long reg_ul[0]; > + } percpu[VGIC_MAX_CPUS]; > + union { > + u32 reg[VGIC_NR_SHARED_IRQS / 4]; > + unsigned long reg_ul[0]; > + } shared; > +}; Argh, it's another one! :) > + > +static inline u32 *vgic_bytemap_get_reg(struct vgic_bytemap *x, > + int cpuid, u32 offset) > +{ > + offset >>= 2; > + BUG_ON(offset > (VGIC_NR_IRQS / 4)); > + if (offset < 4) > + return x->percpu[cpuid].reg + offset; > + else > + return x->shared.reg + offset - 8; > +} > + > +static inline int vgic_bytemap_get_irq_val(struct vgic_bytemap *x, > + int cpuid, int irq) > +{ > + u32 *reg, shift; > + shift = (irq & 3) * 8; > + reg = vgic_bytemap_get_reg(x, cpuid, irq); > + return (*reg >> shift) & 0xff; > +} > + > +static inline void vgic_bytemap_set_irq_val(struct vgic_bytemap *x, > + int cpuid, int irq, int val) > +{ > + u32 *reg, shift; > + shift = (irq & 3) * 8; > + reg = vgic_bytemap_get_reg(x, cpuid, irq); > + *reg &= ~(0xff << shift); > + *reg |= (val & 0xff) << shift; > +} > + > struct vgic_dist { > +#ifdef CONFIG_KVM_ARM_VGIC > + spinlock_t lock; > + > + /* Virtual control interface mapping */ > + void __iomem *vctrl_base; > + > /* Distributor and vcpu interface mapping in the guest */ > phys_addr_t vgic_dist_base; > phys_addr_t vgic_cpu_base; > + > + /* Distributor enabled */ > + u32 enabled; > + > + /* Interrupt enabled (one bit per IRQ) */ > + struct vgic_bitmap irq_enabled; > + > + /* Interrupt 'pin' level */ > + struct vgic_bitmap irq_state; > + > + /* Level-triggered interrupt in progress */ > + struct vgic_bitmap irq_active; > + > + /* Interrupt priority. Not used yet. */ > + struct vgic_bytemap irq_priority; What would the bitmap component of the bytemap represent for priorities? > + > + /* Level/edge triggered */ > + struct vgic_bitmap irq_cfg; > + > + /* Source CPU per SGI and target CPU */ > + u8 irq_sgi_sources[VGIC_MAX_CPUS][16]; Ah, I guess my VGIC_NR_PRIVATE_IRQS interrupt should be further divided... > + /* Target CPU for each IRQ */ > + u8 irq_spi_cpu[VGIC_NR_SHARED_IRQS]; > + struct vgic_bitmap irq_spi_target[VGIC_MAX_CPUS]; > + > + /* Bitmap indicating which CPU has something pending */ > + unsigned long irq_pending_on_cpu; > +#endif > }; > > struct vgic_cpu { > diff --git a/arch/arm/kvm/vgic.c b/arch/arm/kvm/vgic.c > index f85b275..82feee8 100644 > --- a/arch/arm/kvm/vgic.c > +++ b/arch/arm/kvm/vgic.c > @@ -22,6 +22,42 @@ > #include <linux/io.h> > #include <asm/kvm_emulate.h> > > +/* > + * How the whole thing works (courtesy of Christoffer Dall): > + * > + * - At any time, the dist->irq_pending_on_cpu is the oracle that knows if > + * something is pending > + * - VGIC pending interrupts are stored on the vgic.irq_state vgic > + * bitmap (this bitmap is updated by both user land ioctls and guest > + * mmio ops) and indicate the 'wire' state. > + * - Every time the bitmap changes, the irq_pending_on_cpu oracle is > + * recalculated > + * - To calculate the oracle, we need info for each cpu from > + * compute_pending_for_cpu, which considers: > + * - PPI: dist->irq_state & dist->irq_enable > + * - SPI: dist->irq_state & dist->irq_enable & dist->irq_spi_target > + * - irq_spi_target is a 'formatted' version of the GICD_ICFGR > + * registers, stored on each vcpu. We only keep one bit of > + * information per interrupt, making sure that only one vcpu can > + * accept the interrupt. > + * - The same is true when injecting an interrupt, except that we only > + * consider a single interrupt at a time. The irq_spi_cpu array > + * contains the target CPU for each SPI. > + * > + * The handling of level interrupts adds some extra complexity. We > + * need to track when the interrupt has been EOIed, so we can sample > + * the 'line' again. This is achieved as such: > + * > + * - When a level interrupt is moved onto a vcpu, the corresponding > + * bit in irq_active is set. As long as this bit is set, the line > + * will be ignored for further interrupts. The interrupt is injected > + * into the vcpu with the VGIC_LR_EOI bit set (generate a > + * maintenance interrupt on EOI). > + * - When the interrupt is EOIed, the maintenance interrupt fires, > + * and clears the corresponding bit in irq_active. This allow the > + * interrupt line to be sampled again. > + */ > + > #define VGIC_ADDR_UNDEF (-1) > #define IS_VGIC_ADDR_UNDEF(_x) ((_x) == (typeof(_x))VGIC_ADDR_UNDEF) > > @@ -38,6 +74,14 @@ > #define ACCESS_WRITE_VALUE (3 << 1) > #define ACCESS_WRITE_MASK(x) ((x) & (3 << 1)) > > +static void vgic_update_state(struct kvm *kvm); > +static void vgic_dispatch_sgi(struct kvm_vcpu *vcpu, u32 reg); > + > +static inline int vgic_irq_is_edge(struct vgic_dist *dist, int irq) > +{ > + return vgic_bitmap_get_irq_val(&dist->irq_cfg, 0, irq); > +} so vgic_bitmap_get_irq_val returns 0 for level and anything else for edge? Maybe an enum or something could make this clearer? Also, why not take a vcpu or cpuid parameter to pass through, rather than assuming 0? > + > /** > * vgic_reg_access - access vgic register > * @mmio: pointer to the data describing the mmio access > @@ -101,6 +145,280 @@ static void vgic_reg_access(struct kvm_exit_mmio *mmio, u32 *reg, > } > } > > +static bool handle_mmio_misc(struct kvm_vcpu *vcpu, > + struct kvm_exit_mmio *mmio, u32 offset) > +{ > + u32 reg; > + u32 u32off = offset & 3; u32off? Bitten by a regex perhaps? > + > + switch (offset & ~3) { > + case 0: /* CTLR */ > + reg = vcpu->kvm->arch.vgic.enabled; > + vgic_reg_access(mmio, ®, u32off, > + ACCESS_READ_VALUE | ACCESS_WRITE_VALUE); > + if (mmio->is_write) { > + vcpu->kvm->arch.vgic.enabled = reg & 1; > + vgic_update_state(vcpu->kvm); > + return true; > + } > + break; > + > + case 4: /* TYPER */ > + reg = (atomic_read(&vcpu->kvm->online_vcpus) - 1) << 5; > + reg |= (VGIC_NR_IRQS >> 5) - 1; > + vgic_reg_access(mmio, ®, u32off, > + ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED); > + break; > + > + case 8: /* IIDR */ > + reg = 0x4B00043B; > + vgic_reg_access(mmio, ®, u32off, > + ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED); > + break; > + } > + > + return false; > +} > + > +static bool handle_mmio_raz_wi(struct kvm_vcpu *vcpu, > + struct kvm_exit_mmio *mmio, u32 offset) > +{ > + vgic_reg_access(mmio, NULL, offset, > + ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED); > + return false; > +} > + > +static bool handle_mmio_set_enable_reg(struct kvm_vcpu *vcpu, > + struct kvm_exit_mmio *mmio, u32 offset) > +{ > + u32 *reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_enabled, > + vcpu->vcpu_id, offset); > + vgic_reg_access(mmio, reg, offset, > + ACCESS_READ_VALUE | ACCESS_WRITE_SETBIT); > + if (mmio->is_write) { > + vgic_update_state(vcpu->kvm); > + return true; > + } > + > + return false; > +} > + > +static bool handle_mmio_clear_enable_reg(struct kvm_vcpu *vcpu, > + struct kvm_exit_mmio *mmio, u32 offset) > +{ > + u32 *reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_enabled, > + vcpu->vcpu_id, offset); > + vgic_reg_access(mmio, reg, offset, > + ACCESS_READ_VALUE | ACCESS_WRITE_CLEARBIT); > + if (mmio->is_write) { > + if (offset < 4) /* Force SGI enabled */ > + *reg |= 0xffff; > + vgic_update_state(vcpu->kvm); > + return true; > + } > + > + return false; > +} > + > +static bool handle_mmio_set_pending_reg(struct kvm_vcpu *vcpu, > + struct kvm_exit_mmio *mmio, u32 offset) > +{ > + u32 *reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_state, > + vcpu->vcpu_id, offset); > + vgic_reg_access(mmio, reg, offset, > + ACCESS_READ_VALUE | ACCESS_WRITE_SETBIT); > + if (mmio->is_write) { > + vgic_update_state(vcpu->kvm); > + return true; > + } > + > + return false; > +} > + > +static bool handle_mmio_clear_pending_reg(struct kvm_vcpu *vcpu, > + struct kvm_exit_mmio *mmio, u32 offset) > +{ > + u32 *reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_state, > + vcpu->vcpu_id, offset); > + vgic_reg_access(mmio, reg, offset, > + ACCESS_READ_VALUE | ACCESS_WRITE_CLEARBIT); > + if (mmio->is_write) { > + vgic_update_state(vcpu->kvm); > + return true; > + } > + > + return false; > +} > + > +static bool handle_mmio_priority_reg(struct kvm_vcpu *vcpu, > + struct kvm_exit_mmio *mmio, u32 offset) > +{ > + u32 *reg = vgic_bytemap_get_reg(&vcpu->kvm->arch.vgic.irq_priority, > + vcpu->vcpu_id, offset); > + vgic_reg_access(mmio, reg, offset, > + ACCESS_READ_VALUE | ACCESS_WRITE_VALUE); > + return false; > +} What do you gain from returning a bool from the MMIO handlers? Why not assume that state has always been updated and kick the vcpus if something is pending? > + > +static u32 vgic_get_target_reg(struct kvm *kvm, int irq) > +{ > + struct vgic_dist *dist = &kvm->arch.vgic; > + struct kvm_vcpu *vcpu; > + int i, c; > + unsigned long *bmap; > + u32 val = 0; > + > + BUG_ON(irq & 3); > + BUG_ON(irq < 32); Again, these look scary because I can't see the offset sanity checking for the MMIO traps... > + > + irq -= 32; > + > + kvm_for_each_vcpu(c, vcpu, kvm) { > + bmap = vgic_bitmap_get_shared_map(&dist->irq_spi_target[c]); > + for (i = 0; i < 4; i++) Is that 4 from sizeof(unsigned long)? > + if (test_bit(irq + i, bmap)) > + val |= 1 << (c + i * 8); > + } > + > + return val; > +} > + > +static void vgic_set_target_reg(struct kvm *kvm, u32 val, int irq) > +{ > + struct vgic_dist *dist = &kvm->arch.vgic; > + struct kvm_vcpu *vcpu; > + int i, c; > + unsigned long *bmap; > + u32 target; > + > + BUG_ON(irq & 3); > + BUG_ON(irq < 32); > + > + irq -= 32; > + > + /* > + * Pick the LSB in each byte. This ensures we target exactly > + * one vcpu per IRQ. If the byte is null, assume we target > + * CPU0. > + */ > + for (i = 0; i < 4; i++) { > + int shift = i * 8; Is this from BITS_PER_BYTE? > + target = ffs((val >> shift) & 0xffU); > + target = target ? (target - 1) : 0; __ffs? > + dist->irq_spi_cpu[irq + i] = target; > + kvm_for_each_vcpu(c, vcpu, kvm) { > + bmap = vgic_bitmap_get_shared_map(&dist->irq_spi_target[c]); > + if (c == target) > + set_bit(irq + i, bmap); > + else > + clear_bit(irq + i, bmap); > + } > + } > +} > + > +static bool handle_mmio_target_reg(struct kvm_vcpu *vcpu, > + struct kvm_exit_mmio *mmio, u32 offset) > +{ > + u32 reg; > + > + /* We treat the banked interrupts targets as read-only */ > + if (offset < 32) { > + u32 roreg = 1 << vcpu->vcpu_id; > + roreg |= roreg << 8; > + roreg |= roreg << 16; > + > + vgic_reg_access(mmio, &roreg, offset, > + ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED); > + return false; > + } > + > + reg = vgic_get_target_reg(vcpu->kvm, offset & ~3U); > + vgic_reg_access(mmio, ®, offset, > + ACCESS_READ_VALUE | ACCESS_WRITE_VALUE); > + if (mmio->is_write) { > + vgic_set_target_reg(vcpu->kvm, reg, offset & ~3U); > + vgic_update_state(vcpu->kvm); > + return true; > + } > + > + return false; > +} > + > +static u32 vgic_cfg_expand(u16 val) > +{ > + u32 res = 0; > + int i; > + > + for (i = 0; i < 16; i++) > + res |= (val >> i) << (2 * i + 1); Ok, you've lost me on this one but replacing some of the magic numbers with the constants they represent would be much appreciated, please! > + > + return res; > +} > + > +static u16 vgic_cfg_compress(u32 val) > +{ > + u16 res = 0; > + int i; > + > + for (i = 0; i < 16; i++) > + res |= (val >> (i * 2 + 1)) << i; > + > + return res; > +} > + > +/* > + * The distributor uses 2 bits per IRQ for the CFG register, but the > + * LSB is always 0. As such, we only keep the upper bit, and use the > + * two above functions to compress/expand the bits > + */ > +static bool handle_mmio_cfg_reg(struct kvm_vcpu *vcpu, > + struct kvm_exit_mmio *mmio, u32 offset) > +{ > + u32 val; > + u32 *reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_cfg, > + vcpu->vcpu_id, offset >> 1); > + if (offset & 2) > + val = *reg >> 16; > + else > + val = *reg & 0xffff; > + > + val = vgic_cfg_expand(val); > + vgic_reg_access(mmio, &val, offset, > + ACCESS_READ_VALUE | ACCESS_WRITE_VALUE); > + if (mmio->is_write) { > + if (offset < 4) { > + *reg = ~0U; /* Force PPIs/SGIs to 1 */ > + return false; > + } > + > + val = vgic_cfg_compress(val); > + if (offset & 2) { > + *reg &= 0xffff; > + *reg |= val << 16; > + } else { > + *reg &= 0xffff << 16; > + *reg |= val; > + } > + } > + > + return false; > +} > + > +static bool handle_mmio_sgi_reg(struct kvm_vcpu *vcpu, > + struct kvm_exit_mmio *mmio, u32 offset) > +{ > + u32 reg; > + vgic_reg_access(mmio, ®, offset, > + ACCESS_READ_RAZ | ACCESS_WRITE_VALUE); > + if (mmio->is_write) { > + vgic_dispatch_sgi(vcpu, reg); > + vgic_update_state(vcpu->kvm); > + return true; > + } > + > + return false; > +} > + > /* All this should be handled by kvm_bus_io_*()... FIXME!!! */ > struct mmio_range { > unsigned long base; > @@ -110,6 +428,66 @@ struct mmio_range { > }; > > static const struct mmio_range vgic_ranges[] = { > + { /* CTRL, TYPER, IIDR */ > + .base = 0, > + .len = 12, > + .handle_mmio = handle_mmio_misc, > + }, > + { /* IGROUPRn */ > + .base = 0x80, > + .len = VGIC_NR_IRQS / 8, > + .handle_mmio = handle_mmio_raz_wi, > + }, > + { /* ISENABLERn */ > + .base = 0x100, > + .len = VGIC_NR_IRQS / 8, > + .handle_mmio = handle_mmio_set_enable_reg, > + }, > + { /* ICENABLERn */ > + .base = 0x180, > + .len = VGIC_NR_IRQS / 8, > + .handle_mmio = handle_mmio_clear_enable_reg, > + }, > + { /* ISPENDRn */ > + .base = 0x200, > + .len = VGIC_NR_IRQS / 8, > + .handle_mmio = handle_mmio_set_pending_reg, > + }, > + { /* ICPENDRn */ > + .base = 0x280, > + .len = VGIC_NR_IRQS / 8, > + .handle_mmio = handle_mmio_clear_pending_reg, > + }, > + { /* ISACTIVERn */ > + .base = 0x300, > + .len = VGIC_NR_IRQS / 8, > + .handle_mmio = handle_mmio_raz_wi, > + }, > + { /* ICACTIVERn */ > + .base = 0x380, > + .len = VGIC_NR_IRQS / 8, > + .handle_mmio = handle_mmio_raz_wi, > + }, > + { /* IPRIORITYRn */ > + .base = 0x400, > + .len = VGIC_NR_IRQS, > + .handle_mmio = handle_mmio_priority_reg, > + }, > + { /* ITARGETSRn */ > + .base = 0x800, > + .len = VGIC_NR_IRQS, > + .handle_mmio = handle_mmio_target_reg, > + }, > + { /* ICFGRn */ > + .base = 0xC00, > + .len = VGIC_NR_IRQS / 4, > + .handle_mmio = handle_mmio_cfg_reg, > + }, > + { /* SGIRn */ > + .base = 0xF00, > + .len = 4, > + .handle_mmio = handle_mmio_sgi_reg, > + }, Why not #define the offset values for the base fields instead of commenting the literals? Will -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html