Re: [PATCH v3 06/17] KVM: PPC: Book3S HV: XIVE: add controls for the EQ configuration

Cédric Le Goater <clg@xxxxxxxx> · Mon, 18 Mar 2019 15:12:10 +0100

On 3/18/19 4:23 AM, David Gibson wrote:
> On Fri, Mar 15, 2019 at 01:05:58PM +0100, Cédric Le Goater wrote:
>> These controls will be used by the H_INT_SET_QUEUE_CONFIG and
>> H_INT_GET_QUEUE_CONFIG hcalls from QEMU to configure the underlying
>> Event Queue in the XIVE IC. They will also be used to restore the
>> configuration of the XIVE EQs and to capture the internal run-time
>> state of the EQs. Both 'get' and 'set' rely on an OPAL call to access
>> the EQ toggle bit and EQ index which are updated by the XIVE IC when
>> event notifications are enqueued in the EQ.
>>
>> The value of the guest physical address of the event queue is saved in
>> the XIVE internal xive_q structure for later use. That is when
>> migration needs to mark the EQ pages dirty to capture a consistent
>> memory state of the VM.
>>
>> To be noted that H_INT_SET_QUEUE_CONFIG does not require the extra
>> OPAL call setting the EQ toggle bit and EQ index to configure the EQ,
>> but restoring the EQ state will.
>>
>> Signed-off-by: Cédric Le Goater <clg@xxxxxxxx>
>> ---
>>
>>  Changes since v2 :
>>  
>>  - fixed comments on the KVM device attribute definitions
>>  - fixed check on supported EQ size to restrict to 64K pages
>>  - checked kvm_eq.flags that need to be zero
>>  - removed the OPAL call when EQ qtoggle bit and index are zero. 
>>
>>  arch/powerpc/include/asm/xive.h            |   2 +
>>  arch/powerpc/include/uapi/asm/kvm.h        |  21 ++
>>  arch/powerpc/kvm/book3s_xive.h             |   2 +
>>  arch/powerpc/kvm/book3s_xive.c             |  15 +-
>>  arch/powerpc/kvm/book3s_xive_native.c      | 232 +++++++++++++++++++++
>>  Documentation/virtual/kvm/devices/xive.txt |  31 +++
>>  6 files changed, 297 insertions(+), 6 deletions(-)
>>
>> diff --git a/arch/powerpc/include/asm/xive.h b/arch/powerpc/include/asm/xive.h
>> index b579a943407b..46891f321606 100644
>> --- a/arch/powerpc/include/asm/xive.h
>> +++ b/arch/powerpc/include/asm/xive.h
>> @@ -73,6 +73,8 @@ struct xive_q {
>>  	u32			esc_irq;
>>  	atomic_t		count;
>>  	atomic_t		pending_count;
>> +	u64			guest_qpage;
>> +	u32			guest_qsize;
>>  };
>>  
>>  /* Global enable flags for the XIVE support */
>> diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h
>> index 12bb01baf0ae..1cd728c87d7c 100644
>> --- a/arch/powerpc/include/uapi/asm/kvm.h
>> +++ b/arch/powerpc/include/uapi/asm/kvm.h
>> @@ -679,6 +679,7 @@ struct kvm_ppc_cpu_char {
>>  #define KVM_DEV_XIVE_GRP_CTRL		1
>>  #define KVM_DEV_XIVE_GRP_SOURCE		2	/* 64-bit source identifier */
>>  #define KVM_DEV_XIVE_GRP_SOURCE_CONFIG	3	/* 64-bit source identifier */
>> +#define KVM_DEV_XIVE_GRP_EQ_CONFIG	4	/* 64-bit EQ identifier */
>>  
>>  /* Layout of 64-bit XIVE source attribute values */
>>  #define KVM_XIVE_LEVEL_SENSITIVE	(1ULL << 0)
>> @@ -694,4 +695,24 @@ struct kvm_ppc_cpu_char {
>>  #define KVM_XIVE_SOURCE_EISN_SHIFT	33
>>  #define KVM_XIVE_SOURCE_EISN_MASK	0xfffffffe00000000ULL
>>  
>> +/* Layout of 64-bit EQ identifier */
>> +#define KVM_XIVE_EQ_PRIORITY_SHIFT	0
>> +#define KVM_XIVE_EQ_PRIORITY_MASK	0x7
>> +#define KVM_XIVE_EQ_SERVER_SHIFT	3
>> +#define KVM_XIVE_EQ_SERVER_MASK		0xfffffff8ULL
>> +
>> +/* Layout of EQ configuration values (64 bytes) */
>> +struct kvm_ppc_xive_eq {
>> +	__u32 flags;
>> +	__u32 qsize;
>> +	__u64 qpage;
>> +	__u32 qtoggle;
>> +	__u32 qindex;
>> +	__u8  pad[40];
>> +};
>> +
>> +#define KVM_XIVE_EQ_FLAG_ENABLED	0x00000001
>> +#define KVM_XIVE_EQ_FLAG_ALWAYS_NOTIFY	0x00000002
>> +#define KVM_XIVE_EQ_FLAG_ESCALATE	0x00000004
>> +
>>  #endif /* __LINUX_KVM_POWERPC_H */
>> diff --git a/arch/powerpc/kvm/book3s_xive.h b/arch/powerpc/kvm/book3s_xive.h
>> index ae26fe653d98..622f594d93e1 100644
>> --- a/arch/powerpc/kvm/book3s_xive.h
>> +++ b/arch/powerpc/kvm/book3s_xive.h
>> @@ -272,6 +272,8 @@ struct kvmppc_xive_src_block *kvmppc_xive_create_src_block(
>>  	struct kvmppc_xive *xive, int irq);
>>  void kvmppc_xive_free_sources(struct kvmppc_xive_src_block *sb);
>>  int kvmppc_xive_select_target(struct kvm *kvm, u32 *server, u8 prio);
>> +int kvmppc_xive_attach_escalation(struct kvm_vcpu *vcpu, u8 prio,
>> +				  bool single_escalation);
>>  
>>  #endif /* CONFIG_KVM_XICS */
>>  #endif /* _KVM_PPC_BOOK3S_XICS_H */
>> diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c
>> index e09f3addffe5..c1b7aa7dbc28 100644
>> --- a/arch/powerpc/kvm/book3s_xive.c
>> +++ b/arch/powerpc/kvm/book3s_xive.c
>> @@ -166,7 +166,8 @@ static irqreturn_t xive_esc_irq(int irq, void *data)
>>  	return IRQ_HANDLED;
>>  }
>>  
>> -static int xive_attach_escalation(struct kvm_vcpu *vcpu, u8 prio)
>> +int kvmppc_xive_attach_escalation(struct kvm_vcpu *vcpu, u8 prio,
>> +				  bool single_escalation)
>>  {
>>  	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
>>  	struct xive_q *q = &xc->queues[prio];
>> @@ -185,7 +186,7 @@ static int xive_attach_escalation(struct kvm_vcpu *vcpu, u8 prio)
>>  		return -EIO;
>>  	}
>>  
>> -	if (xc->xive->single_escalation)
>> +	if (single_escalation)
>>  		name = kasprintf(GFP_KERNEL, "kvm-%d-%d",
>>  				 vcpu->kvm->arch.lpid, xc->server_num);
>>  	else
>> @@ -217,7 +218,7 @@ static int xive_attach_escalation(struct kvm_vcpu *vcpu, u8 prio)
>>  	 * interrupt, thus leaving it effectively masked after
>>  	 * it fires once.
>>  	 */
>> -	if (xc->xive->single_escalation) {
>> +	if (single_escalation) {
>>  		struct irq_data *d = irq_get_irq_data(xc->esc_virq[prio]);
>>  		struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
>>  
>> @@ -291,7 +292,8 @@ static int xive_check_provisioning(struct kvm *kvm, u8 prio)
>>  			continue;
>>  		rc = xive_provision_queue(vcpu, prio);
>>  		if (rc == 0 && !xive->single_escalation)
>> -			xive_attach_escalation(vcpu, prio);
>> +			kvmppc_xive_attach_escalation(vcpu, prio,
>> +						      xive->single_escalation);
>>  		if (rc)
>>  			return rc;
>>  	}
>> @@ -1214,7 +1216,8 @@ int kvmppc_xive_connect_vcpu(struct kvm_device *dev,
>>  		if (xive->qmap & (1 << i)) {
>>  			r = xive_provision_queue(vcpu, i);
>>  			if (r == 0 && !xive->single_escalation)
>> -				xive_attach_escalation(vcpu, i);
>> +				kvmppc_xive_attach_escalation(
>> +					vcpu, i, xive->single_escalation);
>>  			if (r)
>>  				goto bail;
>>  		} else {
>> @@ -1229,7 +1232,7 @@ int kvmppc_xive_connect_vcpu(struct kvm_device *dev,
>>  	}
>>  
>>  	/* If not done above, attach priority 0 escalation */
>> -	r = xive_attach_escalation(vcpu, 0);
>> +	r = kvmppc_xive_attach_escalation(vcpu, 0, xive->single_escalation);
>>  	if (r)
>>  		goto bail;
>>  
>> diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c
>> index b841d339f674..42e824658a30 100644
>> --- a/arch/powerpc/kvm/book3s_xive_native.c
>> +++ b/arch/powerpc/kvm/book3s_xive_native.c
>> @@ -340,6 +340,226 @@ static int kvmppc_xive_native_set_source_config(struct kvmppc_xive *xive,
>>  						       priority, masked, eisn);
>>  }
>>  
>> +static int xive_native_validate_queue_size(u32 qsize)
>> +{
>> +	/*
>> +	 * We only support 64K pages for the moment. This is also
>> +	 * advertised in the DT property "ibm,xive-eq-sizes"
> 
> IIUC, that won't work properly if you had a guest using 4kiB pages.

> That's fine, but do we have somewhere that checks for that case and
> throws a suitable error?

Not in the device. 

So, we should check the current page_size of the guest ? Is there a way 
to do that simply from KVM ? 

>> +	 */
>> +	switch (qsize) {
>> +	case 0: /* EQ reset */
>> +	case 16:
>> +		return 0;
>> +	case 12:
>> +	case 21:
>> +	case 24:
>> +	default:
>> +		return -EINVAL;
>> +	}
>> +}
>> +
>> +static int kvmppc_xive_native_set_queue_config(struct kvmppc_xive *xive,
>> +					       long eq_idx, u64 addr)
>> +{
>> +	struct kvm *kvm = xive->kvm;
>> +	struct kvm_vcpu *vcpu;
>> +	struct kvmppc_xive_vcpu *xc;
>> +	void __user *ubufp = (u64 __user *) addr;
> 
> Nit: that should be (void __user *) on the right, shouldn't it?

yes.

> 
>> +	u32 server;
>> +	u8 priority;
>> +	struct kvm_ppc_xive_eq kvm_eq;
>> +	int rc;
>> +	__be32 *qaddr = 0;
>> +	struct page *page;
>> +	struct xive_q *q;
>> +
>> +	/*
>> +	 * Demangle priority/server tuple from the EQ identifier
>> +	 */
>> +	priority = (eq_idx & KVM_XIVE_EQ_PRIORITY_MASK) >>
>> +		KVM_XIVE_EQ_PRIORITY_SHIFT;
>> +	server = (eq_idx & KVM_XIVE_EQ_SERVER_MASK) >>
>> +		KVM_XIVE_EQ_SERVER_SHIFT;
>> +
>> +	if (copy_from_user(&kvm_eq, ubufp, sizeof(kvm_eq)))
>> +		return -EFAULT;
>> +
>> +	vcpu = kvmppc_xive_find_server(kvm, server);
>> +	if (!vcpu) {
>> +		pr_err("Can't find server %d\n", server);
>> +		return -ENOENT;
>> +	}
>> +	xc = vcpu->arch.xive_vcpu;
>> +
>> +	if (priority != xive_prio_from_guest(priority)) {
>> +		pr_err("Trying to restore invalid queue %d for VCPU %d\n",
>> +		       priority, server);
>> +		return -EINVAL;
>> +	}
>> +	q = &xc->queues[priority];
>> +
>> +	pr_devel("%s VCPU %d priority %d fl:%x sz:%d addr:%llx g:%d idx:%d\n",
>> +		 __func__, server, priority, kvm_eq.flags,
>> +		 kvm_eq.qsize, kvm_eq.qpage, kvm_eq.qtoggle, kvm_eq.qindex);
>> +
>> +	/*
>> +	 * We can not tune the EQ configuration from user space. All
>> +	 * is done in OPAL.
>> +	 */
>> +	if (kvm_eq.flags != 0) {
>> +		pr_err("invalid flags %d\n", kvm_eq.flags);
>> +		return -EINVAL;
>> +	}
>> +
>> +	rc = xive_native_validate_queue_size(kvm_eq.qsize);
>> +	if (rc) {
>> +		pr_err("invalid queue size %d\n", kvm_eq.qsize);
>> +		return rc;
>> +	}
>> +
>> +	/* reset queue and disable queueing */
>> +	if (!kvm_eq.qsize) {
>> +		q->guest_qpage = 0;
>> +		q->guest_qsize = 0;
>> +
>> +		rc = xive_native_configure_queue(xc->vp_id, q, priority,
>> +						 NULL, 0, true);
>> +		if (rc) {
>> +			pr_err("Failed to reset queue %d for VCPU %d: %d\n",
>> +			       priority, xc->server_num, rc);
>> +			return rc;
>> +		}
>> +
>> +		if (q->qpage) {
>> +			put_page(virt_to_page(q->qpage));
>> +			q->qpage = NULL;
>> +		}
>> +
>> +		return 0;
>> +	}
>> +
>> +
>> +	page = gfn_to_page(kvm, gpa_to_gfn(kvm_eq.qpage));
>> +	if (is_error_page(page)) {
>> +		pr_warn("Couldn't get guest page for %llx!\n", kvm_eq.qpage);
>> +		return -EINVAL;
>> +	}
> 
> Yeah.. for the case of a 4kiB page host (these days weird, but not
> actually prohibited, AFAIK) you need to check that the qsize selected
> actually fits within the page.

Ah yes. sure.

>> +	qaddr = page_to_virt(page) + (kvm_eq.qpage & ~PAGE_MASK);
>> +
>> +	/* Backup queue page guest address for migration */
> 
> Hm.. KVM itself shouldn't generally need to know about migration.
> IIUC these values won't change from what qemu set them to be, so it
> should be able to store and migrate them without have to get them back
> from the kernel.

Euh. You are completely right. I don't know why I kept those around. 
>> +	q->guest_qpage = kvm_eq.qpage;
>> +	q->guest_qsize = kvm_eq.qsize;
>> +
>> +	rc = xive_native_configure_queue(xc->vp_id, q, priority,
>> +					 (__be32 *) qaddr, kvm_eq.qsize, true);
>> +	if (rc) {
>> +		pr_err("Failed to configure queue %d for VCPU %d: %d\n",
>> +		       priority, xc->server_num, rc);
>> +		put_page(page);
>> +		return rc;
>> +	}
>> +
>> +	/*
>> +	 * Only restore the queue state when needed. When doing the
>> +	 * H_INT_SET_SOURCE_CONFIG hcall, it should not.
>> +	 */
>> +	if (kvm_eq.qtoggle != 0 || kvm_eq.qindex != 0) {
>> +		rc = xive_native_set_queue_state(xc->vp_id, priority,
>> +						 kvm_eq.qtoggle,
>> +						 kvm_eq.qindex);
>> +		if (rc)
>> +			goto error;
>> +	}
>> +
>> +	rc = kvmppc_xive_attach_escalation(vcpu, priority,
>> +					   xive->single_escalation);
>> +error:
>> +	if (rc)
>> +		kvmppc_xive_native_cleanup_queue(vcpu, priority);
>> +	return rc;
>> +}
>> +
>> +static int kvmppc_xive_native_get_queue_config(struct kvmppc_xive *xive,
>> +					       long eq_idx, u64 addr)
>> +{
>> +	struct kvm *kvm = xive->kvm;
>> +	struct kvm_vcpu *vcpu;
>> +	struct kvmppc_xive_vcpu *xc;
>> +	struct xive_q *q;
>> +	void __user *ubufp = (u64 __user *) addr;
>> +	u32 server;
>> +	u8 priority;
>> +	struct kvm_ppc_xive_eq kvm_eq;
>> +	u64 qpage;
>> +	u64 qsize;
>> +	u64 qeoi_page;
>> +	u32 escalate_irq;
>> +	u64 qflags;
>> +	int rc;
>> +
>> +	/*
>> +	 * Demangle priority/server tuple from the EQ identifier
>> +	 */
>> +	priority = (eq_idx & KVM_XIVE_EQ_PRIORITY_MASK) >>
>> +		KVM_XIVE_EQ_PRIORITY_SHIFT;
>> +	server = (eq_idx & KVM_XIVE_EQ_SERVER_MASK) >>
>> +		KVM_XIVE_EQ_SERVER_SHIFT;
>> +
>> +	vcpu = kvmppc_xive_find_server(kvm, server);
>> +	if (!vcpu) {
>> +		pr_err("Can't find server %d\n", server);
>> +		return -ENOENT;
>> +	}
>> +	xc = vcpu->arch.xive_vcpu;
>> +
>> +	if (priority != xive_prio_from_guest(priority)) {
>> +		pr_err("invalid priority for queue %d for VCPU %d\n",
>> +		       priority, server);
>> +		return -EINVAL;
>> +	}
>> +	q = &xc->queues[priority];
>> +
>> +	memset(&kvm_eq, 0, sizeof(kvm_eq));
>> +
>> +	if (!q->qpage)
>> +		return 0;
>> +
>> +	rc = xive_native_get_queue_info(xc->vp_id, priority, &qpage, &qsize,
>> +					&qeoi_page, &escalate_irq, &qflags);
>> +	if (rc)
>> +		return rc;
>> +
>> +	/*
>> +	 * Return some information on the EQ configuration in
>> +	 * OPAL. This is purely informative for now as we can't really
>> +	 * tune the EQ configuration from user space.
>> +	 */
>> +	kvm_eq.flags = 0;
>> +	if (qflags & OPAL_XIVE_EQ_ENABLED)
>> +		kvm_eq.flags |= KVM_XIVE_EQ_FLAG_ENABLED;
>> +	if (qflags & OPAL_XIVE_EQ_ALWAYS_NOTIFY)
>> +		kvm_eq.flags |= KVM_XIVE_EQ_FLAG_ALWAYS_NOTIFY;
>> +	if (qflags & OPAL_XIVE_EQ_ESCALATE)
>> +		kvm_eq.flags |= KVM_XIVE_EQ_FLAG_ESCALATE;
> 
> If there's not really anything it can do about it, does it make sense
> to even expose this info to userspace?

Hmm, good question. 

 - KVM_XIVE_EQ_FLAG_ENABLED		
	may be uselessly obvious.

 - KVM_XIVE_EQ_FLAG_ALWAYS_NOTIFY 	
	means we do not use the END ESBs to coalesce the events at the END
	level. This flag is reflected by the XIVE_EQ_ALWAYS_NOTIFY option
	in the sPAPR specs. We don't support the END ESBs but we might one
	day.

 - KVM_XIVE_EQ_FLAG_ESCALATE
	means the EQ is an escalation. QEMU doesn't really care for now 
	but it's an important information I think.

I tried not to add too many of the END flags, only the relevant ones which
could have an impact in the future modeling.  

I think KVM_XIVE_EQ_FLAG_ALWAYS_NOTIFY is important. I was setting it from
QEMU in the hcall but as OPAL does the same blindly I removed it in v3. 

>> +	kvm_eq.qsize = q->guest_qsize;
>> +	kvm_eq.qpage = q->guest_qpage;
> 
>> +	rc = xive_native_get_queue_state(xc->vp_id, priority, &kvm_eq.qtoggle,
>> +					 &kvm_eq.qindex);
>> +	if (rc)
>> +		return rc;
>> +
>> +	pr_devel("%s VCPU %d priority %d fl:%x sz:%d addr:%llx g:%d idx:%d\n",
>> +		 __func__, server, priority, kvm_eq.flags,
>> +		 kvm_eq.qsize, kvm_eq.qpage, kvm_eq.qtoggle, kvm_eq.qindex);
>> +
>> +	if (copy_to_user(ubufp, &kvm_eq, sizeof(kvm_eq)))
>> +		return -EFAULT;
>> +
>> +	return 0;
>> +}
>> +
>>  static int kvmppc_xive_native_set_attr(struct kvm_device *dev,
>>  				       struct kvm_device_attr *attr)
>>  {
>> @@ -354,6 +574,9 @@ static int kvmppc_xive_native_set_attr(struct kvm_device *dev,
>>  	case KVM_DEV_XIVE_GRP_SOURCE_CONFIG:
>>  		return kvmppc_xive_native_set_source_config(xive, attr->attr,
>>  							    attr->addr);
>> +	case KVM_DEV_XIVE_GRP_EQ_CONFIG:
>> +		return kvmppc_xive_native_set_queue_config(xive, attr->attr,
>> +							   attr->addr);
>>  	}
>>  	return -ENXIO;
>>  }
>> @@ -361,6 +584,13 @@ static int kvmppc_xive_native_set_attr(struct kvm_device *dev,
>>  static int kvmppc_xive_native_get_attr(struct kvm_device *dev,
>>  				       struct kvm_device_attr *attr)
>>  {
>> +	struct kvmppc_xive *xive = dev->private;
>> +
>> +	switch (attr->group) {
>> +	case KVM_DEV_XIVE_GRP_EQ_CONFIG:
>> +		return kvmppc_xive_native_get_queue_config(xive, attr->attr,
>> +							   attr->addr);
>> +	}
>>  	return -ENXIO;
>>  }
>>  
>> @@ -376,6 +606,8 @@ static int kvmppc_xive_native_has_attr(struct kvm_device *dev,
>>  		    attr->attr < KVMPPC_XIVE_NR_IRQS)
>>  			return 0;
>>  		break;
>> +	case KVM_DEV_XIVE_GRP_EQ_CONFIG:
>> +		return 0;
>>  	}
>>  	return -ENXIO;
>>  }
>> diff --git a/Documentation/virtual/kvm/devices/xive.txt b/Documentation/virtual/kvm/devices/xive.txt
>> index 33c64b2cdbe8..a4de64f6e79c 100644
>> --- a/Documentation/virtual/kvm/devices/xive.txt
>> +++ b/Documentation/virtual/kvm/devices/xive.txt
>> @@ -53,3 +53,34 @@ the legacy interrupt mode, referred as XICS (POWER7/8).
>>      -ENXIO:  CPU event queues not configured or configuration of the
>>               underlying HW interrupt failed
>>      -EBUSY:  No CPU available to serve interrupt
>> +
>> +  4. KVM_DEV_XIVE_GRP_EQ_CONFIG (read-write)
>> +  Configures an event queue of a CPU
>> +  Attributes:
>> +    EQ descriptor identifier (64-bit)
>> +  The EQ descriptor identifier is a tuple (server, priority) :
>> +  bits:     | 63   ....  32 | 31 .. 3 |  2 .. 0
>> +  values:   |    unused     |  server | priority
>> +  The kvm_device_attr.addr points to :
>> +    struct kvm_ppc_xive_eq {
>> +	__u32 flags;
>> +	__u32 qsize;
>> +	__u64 qpage;
>> +	__u32 qtoggle;
>> +	__u32 qindex;
>> +	__u8  pad[40];
>> +    };
>> +  - flags: queue flags
>> +  - qsize: queue size (power of 2)
>> +  - qpage: real address of queue
>> +  - qtoggle: current queue toggle bit
>> +  - qindex: current queue index
>> +  - pad: reserved for future use
>> +  Errors:
>> +    -ENOENT: Invalid CPU number
>> +    -EINVAL: Invalid priority
>> +    -EINVAL: Invalid flags
>> +    -EINVAL: Invalid queue size
>> +    -EINVAL: Invalid queue address
>> +    -EFAULT: Invalid user pointer for attr->addr.
>> +    -EIO:    Configuration of the underlying HW failed
>