This adds the API for userspace to instantiate an XICS device in a VM and connect VCPUs to it. The API consists of a new device type for the KVM_CREATE_DEVICE ioctl, a new capability KVM_CAP_IRQ_XICS, which functions similarly to KVM_CAP_IRQ_MPIC, and the KVM_IRQ_LINE ioctl, which is used to assert and deassert interrupt inputs of the XICS. The XICS device has one attribute group, KVM_DEV_XICS_GRP_SOURCES. Each attribute within this group corresponds to the state of one interrupt source. The attribute number is the same as the interrupt source number. Signed-off-by: Paul Mackerras <paulus@xxxxxxxxx> --- Documentation/virtual/kvm/api.txt | 8 ++ Documentation/virtual/kvm/devices/xics.txt | 66 +++++++++ arch/powerpc/include/asm/kvm_ppc.h | 2 + arch/powerpc/include/uapi/asm/kvm.h | 12 ++ arch/powerpc/kvm/book3s_xics.c | 211 ++++++++++++++++++++++++++-- arch/powerpc/kvm/book3s_xics.h | 6 + arch/powerpc/kvm/powerpc.c | 35 +++++ include/linux/kvm_host.h | 1 + include/uapi/linux/kvm.h | 2 + virt/kvm/kvm_main.c | 5 + 10 files changed, 339 insertions(+), 9 deletions(-) create mode 100644 Documentation/virtual/kvm/devices/xics.txt diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 54bb6ad..db230f8 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -2756,3 +2756,11 @@ Parameters: args[0] is the MPIC device fd args[1] is the MPIC CPU number for this vcpu This capability connects the vcpu to an in-kernel MPIC device. + +6.7 KVM_CAP_IRQ_XICS + +Architectures: ppc +Parameters: args[0] is the XICS device fd + args[1] is the XICS CPU number (server ID) for this vcpu + +This capability connects the vcpu to an in-kernel XICS device. diff --git a/Documentation/virtual/kvm/devices/xics.txt b/Documentation/virtual/kvm/devices/xics.txt new file mode 100644 index 0000000..4286493 --- /dev/null +++ b/Documentation/virtual/kvm/devices/xics.txt @@ -0,0 +1,66 @@ +XICS interrupt controller + +Device type supported: KVM_DEV_TYPE_XICS + +Groups: + KVM_DEV_XICS_SOURCES + Attributes: One per interrupt source, indexed by the source number. + +This device emulates the XICS (eXternal Interrupt Controller +Specification) defined in PAPR. The XICS has a set of interrupt +sources, each identified by a 20-bit source number, and a set of +Interrupt Control Presentation (ICP) entities, also called "servers", +each associated with a virtual CPU. + +The ICP entities are created by enabling the KVM_CAP_IRQ_ARCH +capability for each vcpu, specifying KVM_CAP_IRQ_XICS in args[0] and +the interrupt server number (i.e. the vcpu number from the XICS's +point of view) in args[1] of the kvm_enable_cap struct. Each ICP has +64 bits of state which can be read and written using the +KVM_GET_ONE_REG and KVM_SET_ONE_REG ioctls on the vcpu. The 64 bit +state word has the following bitfields, starting at the +least-significant end of the word: + +* Unused, 16 bits + +* Pending interrupt priority, 8 bits + Zero is the highest priority, 255 means no interrupt is pending. + +* Pending IPI (inter-processor interrupt) priority, 8 bits + Zero is the highest priority, 255 means no IPI is pending. + +* Pending interrupt source number, 24 bits + Zero means no interrupt pending, 2 means an IPI is pending + +* Current processor priority, 8 bits + Zero is the highest priority, meaning no interrupts can be + delivered, and 255 is the lowest priority. + +Each source has 64 bits of state that can be read and written using +the KVM_GET_DEVICE_ATTR and KVM_SET_DEVICE_ATTR ioctls, specifying the +KVM_DEV_XICS_SOURCES attribute group, with the attribute number being +the interrupt source number. The 64 bit state word has the following +bitfields, starting from the least-significant end of the word: + +* Destination (server number), 32 bits + This specifies where the interrupt should be sent, and is the + interrupt server number specified for the destination vcpu. + +* Priority, 8 bits + This is the priority specified for this interrupt source, where 0 is + the highest priority and 255 is the lowest. An interrupt with a + priority of 255 will never be delivered. + +* Level sensitive flag, 1 bit + This bit is 1 for a level-sensitive interrupt source, or 0 for + edge-sensitive (or MSI). + +* Masked flag, 1 bit + This bit is set to 1 if the interrupt is masked (cannot be delivered + regardless of its priority), for example by the ibm,int-off RTAS + call, or 0 if it is not masked. + +* Pending flag, 1 bit + This bit is 1 if the source has a pending interrupt, otherwise 0. + +Only one XICS instance may be created per VM. diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 44c8ea1..6041406 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -315,6 +315,8 @@ extern int kvm_vm_ioctl_xics_irq(struct kvm *kvm, struct kvm_irq_level *args); extern int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd); extern u64 kvmppc_xics_get_icp(struct kvm_vcpu *vcpu); extern int kvmppc_xics_set_icp(struct kvm_vcpu *vcpu, u64 icpval); +extern int kvmppc_xics_connect_vcpu(struct kvm_device *dev, + struct kvm_vcpu *vcpu, u32 cpu); #else static inline int kvmppc_xics_enabled(struct kvm_vcpu *vcpu) { return 0; } diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h index 1388282..ed5f6a2 100644 --- a/arch/powerpc/include/uapi/asm/kvm.h +++ b/arch/powerpc/include/uapi/asm/kvm.h @@ -451,4 +451,16 @@ struct kvm_get_htab_header { #define KVM_DEV_MPIC_GRP_REGISTER 2 /* 32-bit */ #define KVM_DEV_MPIC_GRP_IRQ_ACTIVE 3 /* 32-bit */ +/* PPC64 eXternal Interrupt Controller Specification */ +#define KVM_DEV_XICS_GRP_SOURCES 1 /* 64-bit source attributes */ + +/* Layout of 64-bit source attribute values */ +#define KVM_XICS_DESTINATION_SHIFT 0 +#define KVM_XICS_DESTINATION_MASK 0xffffffffULL +#define KVM_XICS_PRIORITY_SHIFT 32 +#define KVM_XICS_PRIORITY_MASK 0xff +#define KVM_XICS_LEVEL_SENSITIVE (1ULL << 40) +#define KVM_XICS_MASKED (1ULL << 41) +#define KVM_XICS_PENDING (1ULL << 42) + #endif /* __LINUX_KVM_POWERPC_H */ diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c index ee841ed..0189f72 100644 --- a/arch/powerpc/kvm/book3s_xics.c +++ b/arch/powerpc/kvm/book3s_xics.c @@ -11,6 +11,7 @@ #include <linux/kvm_host.h> #include <linux/err.h> #include <linux/gfp.h> +#include <linux/anon_inodes.h> #include <asm/uaccess.h> #include <asm/kvm_book3s.h> @@ -45,6 +46,34 @@ */ /* + * Lifetimes + * ========= + * + * The kvm_device code takes a reference to the struct kvm, so the + * struct kvm will exist while the device fd is open. That implies + * that we can rely on the presence of the struct kvm in the device + * get_attr and set_attr calls. Similarly, we can rely on the presence + * of the struct kvm in anything called from within a vcpu KVM_RUN + * call, i.e. any hcall or rtas call. + * + * Vcpus take a reference to the kvm_device, which is only dropped in + * kvmppc_xics_free_icp, which only gets called when the vcpu struct + * is freed, which only happens (currently) when the kvm struct is + * being freed. That implies that we can rely on the presence of the + * kvm_device and the struct kvmppc_xics (which has the same lifetime) + * in any hcall or rtas call. We can also rely on the kvm->xics pointer + * in those circumstances, since it only gets cleared kvmppc_xics_free. + * + * The remaining case is then the kvm_vm_ioctl_xics_irq function, + * called from the KVM_IRQ_LINE vm ioctl. If there are no vcpus + * connected, it would be possible for the kvm_device and the struct + * kvmppc_xics to go away concurrently with kvm_vm_ioctl_xics_irq. + * To guard against this possibility, we use an RCU read lock in + * kvm_vm_ioctl_xics_irq and free the struct kvmppc_xics and associated + * structures with kfree_rcu(). + */ + +/* * TODO * ==== * @@ -55,8 +84,6 @@ * * - Make ICS lockless as well, or at least a per-interrupt lock or hashed * locks array to improve scalability - * - * - ioctl's to save/restore the entire state for snapshot & migration */ /* -- ICS routines -- */ @@ -891,8 +918,8 @@ static void xics_debugfs_init(struct kvmppc_xics *xics) kfree(name); } -struct kvmppc_ics *kvmppc_xics_create_ics(struct kvm *kvm, - struct kvmppc_xics *xics, int irq) +static struct kvmppc_ics *kvmppc_xics_create_ics(struct kvm *kvm, + struct kvmppc_xics *xics, int irq) { struct kvmppc_ics *ics; int i, icsid; @@ -1044,6 +1071,93 @@ int kvmppc_xics_set_icp(struct kvm_vcpu *vcpu, u64 icpval) return 0; } +static int xics_get_source(struct kvmppc_xics *xics, long irq, u64 addr) +{ + int ret; + struct kvmppc_ics *ics; + struct ics_irq_state *irqp; + u64 __user *ubufp = (u64 __user *) addr; + u16 idx; + u64 val, prio; + + ics = kvmppc_xics_find_ics(xics, irq, &idx); + if (!ics) + return -ENOENT; + + irqp = &ics->irq_state[idx]; + mutex_lock(&ics->lock); + ret = -ENOENT; + if (irqp->exists) { + val = irqp->server; + prio = irqp->priority; + if (prio == MASKED) { + val |= KVM_XICS_MASKED; + prio = irqp->saved_priority; + } + val |= prio << KVM_XICS_PRIORITY_SHIFT; + if (irqp->asserted) + val |= KVM_XICS_LEVEL_SENSITIVE | KVM_XICS_PENDING; + else if (irqp->masked_pending || irqp->resend) + val |= KVM_XICS_PENDING; + ret = 0; + } + mutex_unlock(&ics->lock); + + if (!ret && put_user(val, ubufp)) + ret = -EFAULT; + + return ret; +} + +static int xics_set_source(struct kvmppc_xics *xics, long irq, u64 addr) +{ + struct kvmppc_ics *ics; + struct ics_irq_state *irqp; + u64 __user *ubufp = (u64 __user *) addr; + u16 idx; + u64 val; + u8 prio; + u32 server; + + if (irq < KVMPPC_XICS_FIRST_IRQ || irq >= KVMPPC_XICS_NR_IRQS) + return -ENOENT; + + ics = kvmppc_xics_find_ics(xics, irq, &idx); + if (!ics) { + ics = kvmppc_xics_create_ics(xics->kvm, xics, irq); + if (!ics) + return -ENOMEM; + } + irqp = &ics->irq_state[idx]; + if (get_user(val, ubufp)) + return -EFAULT; + + server = val & KVM_XICS_DESTINATION_MASK; + prio = val >> KVM_XICS_PRIORITY_SHIFT; + if (prio != MASKED && + kvmppc_xics_find_server(xics->kvm, server) == NULL) + return -EINVAL; + + mutex_lock(&ics->lock); + irqp->server = server; + irqp->saved_priority = prio; + if (val & KVM_XICS_MASKED) + prio = MASKED; + irqp->priority = prio; + irqp->resend = 0; + irqp->masked_pending = 0; + irqp->asserted = 0; + if ((val & KVM_XICS_PENDING) && (val & KVM_XICS_LEVEL_SENSITIVE)) + irqp->asserted = 1; + irqp->exists = 1; + mutex_unlock(&ics->lock); + + if (val & KVM_XICS_PENDING) + icp_deliver_irq(xics, NULL, irqp->number); + + return 0; +} + /* -- ioctls -- */ int kvm_vm_ioctl_xics_irq(struct kvm *kvm, struct kvm_irq_level *args) @@ -1053,9 +1167,11 @@ int kvm_vm_ioctl_xics_irq(struct kvm *kvm, struct kvm_irq_level *args) /* locking against multiple callers? */ + rcu_read_lock(); + r = -ENODEV; xics = kvm->arch.xics; if (!xics) - return -ENODEV; + goto out; switch (args->level) { case KVM_INTERRUPT_SET: @@ -1067,11 +1183,48 @@ int kvm_vm_ioctl_xics_irq(struct kvm *kvm, struct kvm_irq_level *args) r = -EINVAL; } + out: + rcu_read_unlock(); return r; } -void kvmppc_xics_free(struct kvmppc_xics *xics) +static int xics_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr) +{ + struct kvmppc_xics *xics = dev->private; + + switch (attr->group) { + case KVM_DEV_XICS_GRP_SOURCES: + return xics_set_source(xics, attr->attr, attr->addr); + } + return -ENXIO; +} + +static int xics_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr) +{ + struct kvmppc_xics *xics = dev->private; + + switch (attr->group) { + case KVM_DEV_XICS_GRP_SOURCES: + return xics_get_source(xics, attr->attr, attr->addr); + } + return -ENXIO; +} + +static int xics_has_attr(struct kvm_device *dev, struct kvm_device_attr *attr) +{ + switch (attr->group) { + case KVM_DEV_XICS_GRP_SOURCES: + if (attr->attr >= KVMPPC_XICS_FIRST_IRQ && + attr->attr < KVMPPC_XICS_NR_IRQS) + return 0; + break; + } + return -ENXIO; +} + +static void kvmppc_xics_free(struct kvm_device *dev) { + struct kvmppc_xics *xics = dev->private; int i; struct kvm *kvm = xics->kvm; @@ -1081,19 +1234,24 @@ void kvmppc_xics_free(struct kvmppc_xics *xics) kvm->arch.xics = NULL; for (i = 0; i <= xics->max_icsid; i++) - kfree(xics->ics[i]); - kfree(xics); + if (xics->ics[i]) + kfree_rcu(xics->ics[i], rhead); + kfree_rcu(xics, rhead); + kfree(dev); } -int kvm_xics_create(struct kvm *kvm, u32 type) +static int kvmppc_xics_create(struct kvm_device *dev, u32 type) { struct kvmppc_xics *xics; + struct kvm *kvm = dev->kvm; int ret = 0; xics = kzalloc(sizeof(*xics), GFP_KERNEL); if (!xics) return -ENOMEM; + dev->private = xics; + xics->dev = dev; xics->kvm = kvm; /* Already there ? */ @@ -1120,11 +1278,46 @@ int kvm_xics_create(struct kvm *kvm, u32 type) return 0; } +struct kvm_device_ops kvm_xics_ops = { + .name = "kvm-xics", + .create = kvmppc_xics_create, + .destroy = kvmppc_xics_free, + .set_attr = xics_set_attr, + .get_attr = xics_get_attr, + .has_attr = xics_has_attr, +}; + +int kvmppc_xics_connect_vcpu(struct kvm_device *dev, struct kvm_vcpu *vcpu, + u32 xcpu) +{ + struct kvmppc_xics *xics = dev->private; + int r = -EBUSY; + + if (dev->ops != &kvm_xics_ops) + return -EPERM; + if (xics->kvm != vcpu->kvm) + return -EPERM; + if (vcpu->arch.irq_type) + return -EBUSY; + + r = kvmppc_xics_create_icp(vcpu, xcpu); + if (!r) { + vcpu->arch.irq_type = KVMPPC_IRQ_XICS; + kvm_device_get(xics->dev); + } + + return r; +} + void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu) { + struct kvmppc_xics *xics = vcpu->kvm->arch.xics; + if (!vcpu->arch.icp) return; kfree(vcpu->arch.icp); vcpu->arch.icp = NULL; vcpu->arch.irq_type = KVMPPC_IRQ_DEFAULT; + if (xics) + kvm_device_put(xics->dev); } diff --git a/arch/powerpc/kvm/book3s_xics.h b/arch/powerpc/kvm/book3s_xics.h index e4fdec3..939074c 100644 --- a/arch/powerpc/kvm/book3s_xics.h +++ b/arch/powerpc/kvm/book3s_xics.h @@ -10,6 +10,9 @@ #ifndef _KVM_PPC_BOOK3S_XICS_H #define _KVM_PPC_BOOK3S_XICS_H +#include <linux/types.h> +#include <linux/rcupdate.h> + /* * We use a two-level tree to store interrupt source information. * There are up to 1024 ICS nodes, each of which can represent @@ -83,12 +86,15 @@ struct kvmppc_icp { struct kvmppc_ics { struct mutex lock; u16 icsid; + struct rcu_head rhead; struct ics_irq_state irq_state[KVMPPC_XICS_IRQ_PER_ICS]; }; struct kvmppc_xics { struct kvm *kvm; + struct kvm_device *dev; struct dentry *dentry; + struct rcu_head rhead; u32 max_icsid; bool real_mode; bool real_mode_dbg; diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 2441b81..232a174 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -341,6 +341,9 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_SPAPR_TCE: case KVM_CAP_PPC_ALLOC_HTAB: case KVM_CAP_PPC_RTAS: +#ifdef CONFIG_KVM_XICS + case KVM_CAP_IRQ_XICS: +#endif r = 1; break; #endif /* CONFIG_PPC_BOOK3S_64 */ @@ -829,6 +832,25 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, break; } #endif +#ifdef CONFIG_KVM_XICS + case KVM_CAP_IRQ_XICS: { + struct file *filp; + struct kvm_device *dev; + + r = -EBADF; + filp = fget(cap->args[0]); + if (!filp) + break; + + r = -EPERM; + dev = kvm_device_from_filp(filp); + if (dev) + r = kvmppc_xics_connect_vcpu(dev, vcpu, cap->args[1]); + + fput(filp); + break; + } +#endif /* CONFIG_KVM_XICS */ default: r = -EINVAL; break; @@ -1037,6 +1059,19 @@ long kvm_arch_vm_ioctl(struct file *filp, break; } #endif /* CONFIG_PPC_BOOK3S_64 */ + + case KVM_IRQ_LINE: { + struct kvm *kvm = filp->private_data; + struct kvm_irq_level args; + + r = -EFAULT; + if (copy_from_user(&args, argp, sizeof(args))) + break; + + /* Call all the interrupt controllers */ + r = kvm_vm_ioctl_xics_irq(kvm, &args); + break; + } default: r = -ENOTTY; } diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 55d3d9c..adfb42c 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1102,6 +1102,7 @@ void kvm_device_put(struct kvm_device *dev); struct kvm_device *kvm_device_from_filp(struct file *filp); extern struct kvm_device_ops kvm_mpic_ops; +extern struct kvm_device_ops kvm_xics_ops; #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index d100ac6..7189175 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -671,6 +671,7 @@ struct kvm_ppc_smmu_info { #define KVM_CAP_DEVICE_CTRL 89 #define KVM_CAP_IRQ_MPIC 90 #define KVM_CAP_PPC_RTAS 91 +#define KVM_CAP_IRQ_XICS 92 #ifdef KVM_CAP_IRQ_ROUTING @@ -933,6 +934,7 @@ struct kvm_device_attr { #define KVM_DEV_TYPE_FSL_MPIC_20 1 #define KVM_DEV_TYPE_FSL_MPIC_42 2 +#define KVM_DEV_TYPE_XICS 3 /* ioctl for vm fd */ #define KVM_CREATE_DEVICE _IOWR(KVMIO, 0xe0, struct kvm_create_device) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 5a51869..3ba2076 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2244,6 +2244,11 @@ static int kvm_ioctl_create_device(struct kvm *kvm, ops = &kvm_mpic_ops; break; #endif +#ifdef CONFIG_KVM_XICS + case KVM_DEV_TYPE_XICS: + ops = &kvm_xics_ops; + break; +#endif default: return -ENODEV; } -- 1.7.10.4 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html