Re: [RFC v2 2/4] KVM: x86: add support for ioregionfd signal handling

On Tue, 2021-02-09 at 14:21 +0800, Jason Wang wrote:
> On 2021/1/29 2:32 AM, Jason Wang wrote, quoting Elena Afanasova:
> > The vCPU thread may receive a signal during ioregionfd
> > communication. In that case ioctl(KVM_RUN) needs to return to
> > userspace, and the next ioctl(KVM_RUN) must resume the interrupted
> > ioregionfd operation.
> 
> It looks to me that the patch contains much more than just signal
> handling (e.g. the protocol). Please split it.
> 
Ok

> 
> > Signed-off-by: Elena Afanasova <eafanasova@xxxxxxxxx>
> > ---
> > Changes in v2:
> >    - add support for x86 signal handling
> >    - changes after code review
> > 
> >   arch/x86/kvm/x86.c            | 196 +++++++++++++++++++++++++++++++---
> >   include/linux/kvm_host.h      |  13 +++
> >   include/uapi/linux/ioregion.h |  32 ++++++
> >   virt/kvm/ioregion.c           | 177 +++++++++++++++++++++++++++++-
> >   virt/kvm/kvm_main.c           |  16 ++-
> >   5 files changed, 415 insertions(+), 19 deletions(-)
> >   create mode 100644 include/uapi/linux/ioregion.h
> 
> I wonder whether it's better to split into two patches:
> 
> 1) general signal support for KVM I/O device
> 2) the ioregionfd part
> 
> 
> > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > index ddb28f5ca252..a04516b531da 100644
> > --- a/arch/x86/kvm/x86.c
> > +++ b/arch/x86/kvm/x86.c
> > @@ -5799,19 +5799,33 @@ static int vcpu_mmio_write(struct kvm_vcpu
> > *vcpu, gpa_t addr, int len,
> >   {
> >   	int handled = 0;
> >   	int n;
> > +	int ret = 0;
> > +	bool is_apic;
> >   
> >   	do {
> >   		n = min(len, 8);
> > -		if (!(lapic_in_kernel(vcpu) &&
> > -		      !kvm_iodevice_write(vcpu, &vcpu->arch.apic->dev, addr, n, v))
> > -		    && kvm_io_bus_write(vcpu, KVM_MMIO_BUS, addr, n, v))
> > -			break;
> > +		is_apic = lapic_in_kernel(vcpu) &&
> > +			  !kvm_iodevice_write(vcpu, &vcpu->arch.apic->dev,
> > +					      addr, n, v);
> 
> A better name is needed since "is_apic" only covers the first
> condition.
> 
> 
> > +		if (!is_apic) {
> > +			ret = kvm_io_bus_write(vcpu, KVM_MMIO_BUS,
> > +					       addr, n, v);
> > +			if (ret)
> > +				break;
> > +		}
> >   		handled += n;
> >   		addr += n;
> >   		len -= n;
> >   		v += n;
> >   	} while (len);
> >   
> > +#ifdef CONFIG_KVM_IOREGION
> > +	if (ret == -EINTR) {
> > +		vcpu->run->exit_reason = KVM_EXIT_INTR;
> > +		++vcpu->stat.signal_exits;
> 
> My understanding is that we should check ERESTARTSYS instead of
> EINTR; EINTR means the syscall can't be restarted.
> 
I think the case where ioregionfd communication is interrupted can be
treated as an interrupted ioctl(KVM_RUN).

> E.g we had the following errno for sockets:
> 
> /* Alas, with timeout socket operations are not restartable.
>   * Compare this to poll().
>   */
> static inline int sock_intr_errno(long timeo)
> {
>      return timeo == MAX_SCHEDULE_TIMEOUT ? -ERESTARTSYS : -EINTR;
> }
> 
> For the case of EINTR, do we need to fall back to the vCPU userspace
> process (QEMU)?
> 
Yes, ioctl(KVM_RUN) needs to return to userspace.
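
For reference, a minimal sketch of the userspace side I have in mind
(a QEMU-like vCPU loop; handle_io()/handle_mmio() are placeholders, not
an existing API): on KVM_EXIT_INTR the VMM handles whatever the signal
was about and simply re-enters KVM_RUN, and the kernel then resumes the
interrupted ioregionfd access from the saved ioregion_ctx.

#include <errno.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Sketch only: vcpu_fd is the KVM_CREATE_VCPU fd, run is the mmap'ed
 * struct kvm_run of that vCPU.
 */
static void vcpu_loop(int vcpu_fd, struct kvm_run *run)
{
        for (;;) {
                if (ioctl(vcpu_fd, KVM_RUN, 0) < 0 && errno != EINTR)
                        break;          /* fatal error */

                switch (run->exit_reason) {
                case KVM_EXIT_INTR:
                        /*
                         * A signal interrupted an ioregionfd access; deal
                         * with it (vCPU stop request etc.) and re-enter
                         * KVM_RUN so the kernel resumes the access.
                         */
                        continue;
                case KVM_EXIT_IO:
                        /* handle_io(run); */
                        break;
                case KVM_EXIT_MMIO:
                        /* handle_mmio(run); */
                        break;
                default:
                        break;
                }
        }
}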

> And we probably need a trace point here.
> 
> 
> > +	}
> > +#endif
> > +
> >   	return handled;
> >   }
> >   
> > @@ -5819,14 +5833,20 @@ static int vcpu_mmio_read(struct kvm_vcpu
> > *vcpu, gpa_t addr, int len, void *v)
> >   {
> >   	int handled = 0;
> >   	int n;
> > +	int ret = 0;
> > +	bool is_apic;
> >   
> >   	do {
> >   		n = min(len, 8);
> > -		if (!(lapic_in_kernel(vcpu) &&
> > -		      !kvm_iodevice_read(vcpu, &vcpu->arch.apic->dev,
> > -					 addr, n, v))
> > -		    && kvm_io_bus_read(vcpu, KVM_MMIO_BUS, addr, n, v))
> > -			break;
> > +		is_apic = lapic_in_kernel(vcpu) &&
> > +			  !kvm_iodevice_read(vcpu, &vcpu->arch.apic->dev,
> > +					     addr, n, v);
> > +		if (!is_apic) {
> > +			ret = kvm_io_bus_read(vcpu, KVM_MMIO_BUS,
> > +					      addr, n, v);
> > +			if (ret)
> > +				break;
> > +		}
> >   		trace_kvm_mmio(KVM_TRACE_MMIO_READ, n, addr, v);
> >   		handled += n;
> >   		addr += n;
> > @@ -5834,6 +5854,13 @@ static int vcpu_mmio_read(struct kvm_vcpu
> > *vcpu, gpa_t addr, int len, void *v)
> >   		v += n;
> >   	} while (len);
> >   
> > +#ifdef CONFIG_KVM_IOREGION
> > +	if (ret == -EINTR) {
> > +		vcpu->run->exit_reason = KVM_EXIT_INTR;
> > +		++vcpu->stat.signal_exits;
> > +	}
> > +#endif
> > +
> >   	return handled;
> >   }
> >   
> > @@ -6294,6 +6321,12 @@ static int emulator_read_write(struct
> > x86_emulate_ctxt *ctxt,
> >   	vcpu->mmio_needed = 1;
> >   	vcpu->mmio_cur_fragment = 0;
> >   
> > +#ifdef CONFIG_KVM_IOREGION
> > +	if (vcpu->ioregion_interrupted &&
> > +	    vcpu->run->exit_reason == KVM_EXIT_INTR)
> > +		return (vcpu->ioregion_ctx.in) ? X86EMUL_IO_NEEDED : X86EMUL_CONTINUE;
> > +#endif
> > +
> >   	vcpu->run->mmio.len = min(8u, vcpu->mmio_fragments[0].len);
> >   	vcpu->run->mmio.is_write = vcpu->mmio_is_write = ops->write;
> >   	vcpu->run->exit_reason = KVM_EXIT_MMIO;
> > @@ -6411,16 +6444,23 @@ static int kernel_pio(struct kvm_vcpu
> > *vcpu, void *pd)
> >   
> >   	for (i = 0; i < vcpu->arch.pio.count; i++) {
> >   		if (vcpu->arch.pio.in)
> > -			r = kvm_io_bus_read(vcpu, KVM_PIO_BUS, vcpu->arch.pio.port,
> > +			r = kvm_io_bus_read(vcpu, KVM_PIO_BUS,
> > +					    vcpu->arch.pio.port,
> >   					    vcpu->arch.pio.size, pd);
> >   		else
> >   			r = kvm_io_bus_write(vcpu, KVM_PIO_BUS,
> > -					     vcpu->arch.pio.port, vcpu->arch.pio.size,
> > -					     pd);
> > +					     vcpu->arch.pio.port,
> > +					     vcpu->arch.pio.size, pd);
> >   		if (r)
> >   			break;
> >   		pd += vcpu->arch.pio.size;
> >   	}
> > +#ifdef CONFIG_KVM_IOREGION
> > +	if (vcpu->ioregion_interrupted && r == -EINTR) {
> > +		vcpu->ioregion_ctx.pio = i;
> > +	}
> > +#endif
> > +
> >   	return r;
> >   }
> >   
> > @@ -6428,16 +6468,27 @@ static int emulator_pio_in_out(struct
> > kvm_vcpu *vcpu, int size,
> >   			       unsigned short port, void *val,
> >   			       unsigned int count, bool in)
> >   {
> > +	int ret = 0;
> > +
> >   	vcpu->arch.pio.port = port;
> >   	vcpu->arch.pio.in = in;
> >   	vcpu->arch.pio.count  = count;
> >   	vcpu->arch.pio.size = size;
> >   
> > -	if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
> > +	ret = kernel_pio(vcpu, vcpu->arch.pio_data);
> > +	if (!ret) {
> 
> Unnecessary changes.
> 
> 
> >   		vcpu->arch.pio.count = 0;
> >   		return 1;
> >   	}
> >   
> > +#ifdef CONFIG_KVM_IOREGION
> > +	if (ret == -EINTR) {
> > +		vcpu->run->exit_reason = KVM_EXIT_INTR;
> > +		++vcpu->stat.signal_exits;
> > +		return 0;
> > +	}
> > +#endif
> > +
> >   	vcpu->run->exit_reason = KVM_EXIT_IO;
> >   	vcpu->run->io.direction = in ? KVM_EXIT_IO_IN :
> > KVM_EXIT_IO_OUT;
> >   	vcpu->run->io.size = size;
> > @@ -7141,6 +7192,10 @@ static bool retry_instruction(struct
> > x86_emulate_ctxt *ctxt,
> >   
> >   static int complete_emulated_mmio(struct kvm_vcpu *vcpu);
> >   static int complete_emulated_pio(struct kvm_vcpu *vcpu);
> > +#ifdef CONFIG_KVM_IOREGION
> > +static int complete_ioregion_io(struct kvm_vcpu *vcpu);
> > +static int complete_ioregion_fast_pio(struct kvm_vcpu *vcpu);
> > +#endif
> >   
> >   static void kvm_smm_changed(struct kvm_vcpu *vcpu)
> >   {
> > @@ -7405,6 +7460,14 @@ int x86_emulate_instruction(struct kvm_vcpu
> > *vcpu, gpa_t cr2_or_gpa,
> >   		r = 1;
> >   		if (inject_emulated_exception(vcpu))
> >   			return r;
> > +#ifdef CONFIG_KVM_IOREGION
> > +	} else if (vcpu->ioregion_interrupted &&
> > +		   vcpu->run->exit_reason == KVM_EXIT_INTR) {
> > +		if (vcpu->ioregion_ctx.in)
> > +			writeback = false;
> > +		vcpu->arch.complete_userspace_io = complete_ioregion_io;
> > +		r = 0;
> > +#endif
> >   	} else if (vcpu->arch.pio.count) {
> >   		if (!vcpu->arch.pio.in) {
> >   			/* FIXME: return into emulator if single-
> > stepping.  */
> > @@ -7501,6 +7564,11 @@ static int kvm_fast_pio_out(struct kvm_vcpu
> > *vcpu, int size,
> >   		vcpu->arch.complete_userspace_io =
> >   			complete_fast_pio_out_port_0x7e;
> >   		kvm_skip_emulated_instruction(vcpu);
> > +#ifdef CONFIG_KVM_IOREGION
> > +	} else if (vcpu->ioregion_interrupted &&
> > +		   vcpu->run->exit_reason == KVM_EXIT_INTR) {
> > +		vcpu->arch.complete_userspace_io = complete_ioregion_fast_pio;
> > +#endif
> >   	} else {
> >   		vcpu->arch.pio.linear_rip = kvm_get_linear_rip(vcpu);
> >   		vcpu->arch.complete_userspace_io = complete_fast_pio_out;
> > @@ -7548,6 +7616,13 @@ static int kvm_fast_pio_in(struct kvm_vcpu
> > *vcpu, int size,
> >   		return ret;
> >   	}
> >   
> > +#ifdef CONFIG_KVM_IOREGION
> > +	if (vcpu->ioregion_interrupted &&
> > +	    vcpu->run->exit_reason == KVM_EXIT_INTR) {
> > +		vcpu->arch.complete_userspace_io = complete_ioregion_fast_pio;
> > +		return 0;
> > +	}
> > +#endif
> >   	vcpu->arch.pio.linear_rip = kvm_get_linear_rip(vcpu);
> >   	vcpu->arch.complete_userspace_io = complete_fast_pio_in;
> >   
> > @@ -9204,6 +9279,101 @@ static int complete_emulated_mmio(struct
> > kvm_vcpu *vcpu)
> >   	return 0;
> >   }
> >   
> > +#ifdef CONFIG_KVM_IOREGION
> > +static void complete_ioregion_access(struct kvm_vcpu *vcpu, gpa_t
> > addr,
> > +				     int len, void *val)
> > +{
> > +	if (vcpu->ioregion_ctx.in)
> > +		vcpu->ioregion_ctx.dev->ops->read(vcpu, vcpu->ioregion_ctx.dev,
> > +						  addr, len, val);
> > +	else
> > +		vcpu->ioregion_ctx.dev->ops->write(vcpu, vcpu->ioregion_ctx.dev,
> > +						   addr, len, val);
> 
> Two dumb questions:
> 
> 1) So if the write is interrupted by a signal, we may end up doing the
> write twice or more. Does this satisfy the semantics of all types of
> registers?

I think it's necessary to call kvm_io_bus_{read, write}() here. If
there is no in-kernel device or the ioregion gets deleted, KVM needs to
return to userspace with KVM_EXIT_MMIO/KVM_EXIT_IO.
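
To be concrete, the direction I have in mind for the next version is
roughly the following (only a sketch, not the code in this patch; the
bus selection via mmio_needed is illustrative, and the caller would
have to turn a failure into KVM_EXIT_MMIO/KVM_EXIT_IO):

static int complete_ioregion_access(struct kvm_vcpu *vcpu, gpa_t addr,
                                    int len, void *val)
{
        /* Going through the bus means a deleted ioregion (or a missing
         * in-kernel device) shows up as an error here instead of being
         * completed against a stale device pointer.
         */
        int bus = vcpu->mmio_needed ? KVM_MMIO_BUS : KVM_PIO_BUS;

        if (vcpu->ioregion_ctx.in)
                return kvm_io_bus_read(vcpu, bus, addr, len, val);

        return kvm_io_bus_write(vcpu, bus, addr, len, val);
}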

> E.g. for hardware that counts the number of writes to a specific
> register, etc.
> 2) If the answer is yes, can we simply rewind RIP to re-emulate the 
> instruction?
> 
> 
> > +}
> > +
> > +static int complete_ioregion_mmio(struct kvm_vcpu *vcpu)
> > +{
> > +	struct kvm_mmio_fragment *frag;
> > +	int idx, ret, i, n;
> > +
> > +	idx = srcu_read_lock(&vcpu->kvm->srcu);
> > +	for (i = vcpu->mmio_cur_fragment; i < vcpu->mmio_nr_fragments; i++) {
> > +		frag = &vcpu->mmio_fragments[i];
> > +		do {
> > +			n = min(8u, frag->len);
> > +			complete_ioregion_access(vcpu, frag->gpa, n, frag->data);
> > +			frag->len -= n;
> > +			frag->data += n;
> > +			frag->gpa += n;
> > +		} while (frag->len);
> > +		vcpu->mmio_cur_fragment++;
> > +	}
> > +
> > +	vcpu->mmio_needed = 0;
> > +	if (!vcpu->ioregion_ctx.in) {
> > +		srcu_read_unlock(&vcpu->kvm->srcu, idx);
> > +		return 1;
> > +	}
> > +
> > +	vcpu->mmio_read_completed = 1;
> > +	ret = kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
> > +	srcu_read_unlock(&vcpu->kvm->srcu, idx);
> > +	return ret;
> > +}
> > +
> > +static int complete_ioregion_pio(struct kvm_vcpu *vcpu)
> > +{
> > +	int i, idx, r = 1;
> > +
> > +	idx = srcu_read_lock(&vcpu->kvm->srcu);
> > +	for (i = vcpu->ioregion_ctx.pio; i < vcpu->arch.pio.count; i++) {
> > +		complete_ioregion_access(vcpu, vcpu->ioregion_ctx.addr,
> > +					 vcpu->ioregion_ctx.len,
> > +					 vcpu->ioregion_ctx.val);
> > +		vcpu->ioregion_ctx.val += vcpu->ioregion_ctx.len;
> > +	}
> > +
> > +	if (vcpu->ioregion_ctx.in)
> > +		r = kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
> > +	srcu_read_unlock(&vcpu->kvm->srcu, idx);
> > +	vcpu->arch.pio.count = 0;
> > +
> > +	return r;
> > +}
> > +
> > +static int complete_ioregion_fast_pio(struct kvm_vcpu *vcpu)
> > +{
> > +	int idx;
> > +	u64 val;
> > +
> > +	BUG_ON(!vcpu->ioregion_interrupted);
> > +
> > +	idx = srcu_read_lock(&vcpu->kvm->srcu);
> > +	complete_ioregion_access(vcpu, vcpu->ioregion_ctx.addr,
> > +				 vcpu->ioregion_ctx.len,
> > +				 vcpu->ioregion_ctx.val);
> > +	srcu_read_unlock(&vcpu->kvm->srcu, idx);
> > +
> > +	if (vcpu->ioregion_ctx.in) {
> > +		memcpy(&val, vcpu->ioregion_ctx.val, vcpu->ioregion_ctx.len);
> > +		kvm_rax_write(vcpu, val);
> > +	}
> > +	vcpu->arch.pio.count = 0;
> > +
> > +	return kvm_skip_emulated_instruction(vcpu);
> > +}
> > +
> > +static int complete_ioregion_io(struct kvm_vcpu *vcpu)
> > +{
> > +	BUG_ON(!vcpu->ioregion_interrupted);
> > +
> > +	if (vcpu->mmio_needed)
> > +		return complete_ioregion_mmio(vcpu);
> > +	if (vcpu->arch.pio.count)
> > +		return complete_ioregion_pio(vcpu);
> > +}
> > +#endif
> > +
> >   static void kvm_save_current_fpu(struct fpu *fpu)
> >   {
> >   	/*
> > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > index 7cd667dddba9..5cfdecfca6db 100644
> > --- a/include/linux/kvm_host.h
> > +++ b/include/linux/kvm_host.h
> > @@ -318,6 +318,19 @@ struct kvm_vcpu {
> >   #endif
> >   	bool preempted;
> >   	bool ready;
> > +#ifdef CONFIG_KVM_IOREGION
> > +	bool ioregion_interrupted;
> > +	struct {
> > +		struct kvm_io_device *dev;
> > +		int pio;
> > +		void *val;
> > +		u8 state;
> 
> Let's document the state machine here.
> 
> 
> > +		u64 addr;
> > +		int len;
> > +		u64 data;
> > +		bool in;
> > +	} ioregion_ctx;
> > +#endif
> >   	struct kvm_vcpu_arch arch;
> >   };
> >   
> > diff --git a/include/uapi/linux/ioregion.h
> > b/include/uapi/linux/ioregion.h
> > new file mode 100644
> > index 000000000000..7898c01f84a1
> > --- /dev/null
> > +++ b/include/uapi/linux/ioregion.h
> > @@ -0,0 +1,32 @@
> > +/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */
> > +#ifndef _UAPI_LINUX_IOREGION_H
> > +#define _UAPI_LINUX_IOREGION_H
> > +
> > +/* Wire protocol */
> > +struct ioregionfd_cmd {
> > +	__u32 info;
> > +	__u32 padding;
> > +	__u64 user_data;
> > +	__u64 offset;
> > +	__u64 data;
> > +};
> > +
> > +struct ioregionfd_resp {
> > +	__u64 data;
> > +	__u8 pad[24];
> > +};
> > +
> > +#define IOREGIONFD_CMD_READ    0
> > +#define IOREGIONFD_CMD_WRITE   1
> > +
> > +#define IOREGIONFD_SIZE_8BIT   0
> > +#define IOREGIONFD_SIZE_16BIT  1
> > +#define IOREGIONFD_SIZE_32BIT  2
> > +#define IOREGIONFD_SIZE_64BIT  3
> > +
> > +#define IOREGIONFD_SIZE_OFFSET 4
> > +#define IOREGIONFD_RESP_OFFSET 6
> > +#define IOREGIONFD_SIZE(x) ((x) << IOREGIONFD_SIZE_OFFSET)
> > +#define IOREGIONFD_RESP(x) ((x) << IOREGIONFD_RESP_OFFSET)
> 
> Instead of using macros, why not define these fields explicitly in
> struct ioregionfd_cmd rather than packing them into info?
> 
> 
> > +
> > +#endif
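
For reference, here is a rough sketch of how the userspace (device
emulation) side would consume this protocol; the info decoding below is
inferred from the macros above (the command is assumed to occupy the
bits below IOREGIONFD_SIZE_OFFSET, as pack_cmd() later in this patch
suggests), and device_read()/device_write() are just stubs standing in
for a real device model:

#include <stdbool.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <linux/ioregion.h>	/* the new uapi header above */

/* Placeholder device model hooks. */
static uint64_t device_read(uint64_t user_data, uint64_t offset, uint32_t len)
{
        (void)user_data; (void)offset; (void)len;
        return 0;
}

static void device_write(uint64_t user_data, uint64_t offset, uint32_t len,
                         uint64_t data)
{
        (void)user_data; (void)offset; (void)len; (void)data;
}

/* rfd/wfd are the userspace ends of the two file descriptors registered
 * with KVM: commands arrive on rfd, responses (when requested) go out
 * on wfd.
 */
static void serve_ioregion(int rfd, int wfd)
{
        struct ioregionfd_cmd cmd;
        struct ioregionfd_resp resp;

        while (read(rfd, &cmd, sizeof(cmd)) == sizeof(cmd)) {
                uint32_t op = cmd.info & ((1u << IOREGIONFD_SIZE_OFFSET) - 1);
                uint32_t len = 1u << ((cmd.info >> IOREGIONFD_SIZE_OFFSET) & 0x3);
                bool want_resp = cmd.info & (1u << IOREGIONFD_RESP_OFFSET);

                memset(&resp, 0, sizeof(resp));
                if (op == IOREGIONFD_CMD_READ)
                        resp.data = device_read(cmd.user_data, cmd.offset, len);
                else
                        device_write(cmd.user_data, cmd.offset, len, cmd.data);

                /* Reads always request a response; writes only when not posted. */
                if (want_resp && write(wfd, &resp, sizeof(resp)) != sizeof(resp))
                        break;
        }
}
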
> > diff --git a/virt/kvm/ioregion.c b/virt/kvm/ioregion.c
> > index 48ff92bca966..da38124e1418 100644
> > --- a/virt/kvm/ioregion.c
> > +++ b/virt/kvm/ioregion.c
> > @@ -3,6 +3,7 @@
> >   #include <linux/fs.h>
> >   #include <kvm/iodev.h>
> >   #include "eventfd.h"
> > +#include <uapi/linux/ioregion.h>
> >   
> >   void
> >   kvm_ioregionfd_init(struct kvm *kvm)
> > @@ -38,18 +39,190 @@ ioregion_release(struct ioregion *p)
> >   	kfree(p);
> >   }
> >   
> > +static bool
> > +pack_cmd(struct ioregionfd_cmd *cmd, u64 offset, u64 len, int opt, int resp,
> > +	 u64 user_data, const void *val)
> > +{
> > +	u64 size = 0;
> > +
> > +	switch (len) {
> > +	case 1:
> > +		size = IOREGIONFD_SIZE_8BIT;
> > +		break;
> > +	case 2:
> > +		size = IOREGIONFD_SIZE_16BIT;
> > +		break;
> > +	case 4:
> > +		size = IOREGIONFD_SIZE_32BIT;
> > +		break;
> > +	case 8:
> > +		size = IOREGIONFD_SIZE_64BIT;
> > +		break;
> > +	default:
> > +		return false;
> > +	}
> > +
> > +	if (val)
> > +		memcpy(&cmd->data, val, len);
> > +	cmd->user_data = user_data;
> > +	cmd->offset = offset;
> > +	cmd->info |= opt;
> > +	cmd->info |= IOREGIONFD_SIZE(size);
> > +	cmd->info |= IOREGIONFD_RESP(resp);
> > +
> > +	return true;
> > +}
> > +
> > +enum {
> > +	SEND_CMD,
> > +	GET_REPLY,
> > +	COMPLETE
> > +};
> > +
> > +static void
> > +ioregion_save_ctx(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
> > +		  bool in, gpa_t addr, int len, u64 data, u8 state, void *val)
> > +{
> > +	vcpu->ioregion_interrupted = true;
> > +
> > +	vcpu->ioregion_ctx.dev = this;
> > +	vcpu->ioregion_ctx.val = val;
> > +	vcpu->ioregion_ctx.state = state;
> > +	vcpu->ioregion_ctx.addr = addr;
> > +	vcpu->ioregion_ctx.len = len;
> > +	vcpu->ioregion_ctx.data = data;
> > +	vcpu->ioregion_ctx.in = in;
> > +}
> > +
> >   static int
> >   ioregion_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t addr,
> >   	      int len, void *val)
> >   {
> > -	return -EOPNOTSUPP;
> > +	struct ioregion *p = to_ioregion(this);
> > +	union {
> > +		struct ioregionfd_cmd cmd;
> > +		struct ioregionfd_resp resp;
> > +	} buf;
> > +	int ret = 0;
> > +	int state = 0;
> 
> Let's use SEND_CMD here, otherwise it would be hard for reviewers...
> 
Ok
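
Something like the following could also replace the 0/1 arithmetic on
'state' further down (just a sketch of one option, using the enum from
this patch):

/* Derive the next step explicitly from the result of the
 * kernel_write()/kernel_read() call instead of adding booleans.
 */
static u8 ioregion_next_state(u8 cur, ssize_t ret, size_t expected)
{
        if (ret != (ssize_t)expected)
                return cur;	/* short or failed I/O: retry this step */

        return (cur == SEND_CMD) ? GET_REPLY : COMPLETE;
}

ioregion_read() would then do state = ioregion_next_state(SEND_CMD, ret,
sizeof(buf.cmd)) after kernel_write(), and state =
ioregion_next_state(GET_REPLY, ret, sizeof(buf.resp)) after
kernel_read().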

> 
> > +
> > +	if ((addr + len - 1) > (p->paddr + p->size - 1))
> > +		return -EINVAL;
> > +
> > +	if (unlikely(vcpu->ioregion_interrupted)) {
> > +		vcpu->ioregion_interrupted = false;
> > +
> > +		switch (vcpu->ioregion_ctx.state) {
> > +		case SEND_CMD:
> > +			goto send_cmd;
> > +		case GET_REPLY:
> > +			goto get_repl;
> > +		case COMPLETE:
> 
> I fail to understand under what condition we can reach here.
> 
I was thinking about the case when a signal is received after obtaining
a reply. But it seems it’s unnecessary to consider this.
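
Regarding the earlier request to document the state machine, I'd add
something along these lines next to the enum (wording is mine, based on
the code in this patch; COMPLETE can be dropped if it turns out to be
unnecessary):

/*
 * Progress of an interrupted ioregionfd access, saved in
 * vcpu->ioregion_ctx.state so that the next KVM_RUN can resume it:
 *
 *   SEND_CMD  - struct ioregionfd_cmd has not been (fully) written yet;
 *               resume from the send_cmd label.
 *   GET_REPLY - the command was sent but struct ioregionfd_resp has not
 *               been read yet; resume from the get_repl label.
 *   COMPLETE  - the response was already received before the signal;
 *               only consume ioregion_ctx.data (reads) or return 0
 *               (writes).
 */
enum {
	SEND_CMD,
	GET_REPLY,
	COMPLETE
};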

> 
> > +			memcpy(val, &vcpu->ioregion_ctx.data, len);
> > +			return 0;
> > +		}
> > +	}
> > +
> > +send_cmd:
> > +	memset(&buf, 0, sizeof(buf));
> > +	if (!pack_cmd(&buf.cmd, addr - p->paddr, len, IOREGIONFD_CMD_READ,
> > +		      1, p->user_data, NULL))
> > +		return -EOPNOTSUPP;
> > +
> > +	ret = kernel_write(p->wf, &buf.cmd, sizeof(buf.cmd), 0);
> > +	state = (ret == sizeof(buf.cmd));
> > +	if (signal_pending(current)) {
> > +		ioregion_save_ctx(vcpu, this, 1, addr, len, 0, state, val);
> > +		return -EINTR;
> > +	}
> > +	if (ret != sizeof(buf.cmd)) {
> > +		ret = (ret < 0) ? ret : -EIO;
> > +		return (ret == -EAGAIN || ret == -EWOULDBLOCK) ? -EINVAL : ret;
> > +	}
> > +
> > +get_repl:
> > +	memset(&buf, 0, sizeof(buf));
> > +	ret = kernel_read(p->rf, &buf.resp, sizeof(buf.resp), 0);
> > +	state += (ret == sizeof(buf.resp));
> 
> Let's use enum instead of doing tricks like this.
> 
> Thanks
> 
> 
> > +	if (signal_pending(current)) {
> > +		ioregion_save_ctx(vcpu, this, 1, addr, len,
> > +				  buf.resp.data, state, val);
> > +		return -EINTR;
> > +	}
> > +	if (ret != sizeof(buf.resp)) {
> > +		ret = (ret < 0) ? ret : -EIO;
> > +		return (ret == -EAGAIN || ret == -EWOULDBLOCK) ? -EINVAL : ret;
> > +	}
> > +
> > +	memcpy(val, &buf.resp.data, len);
> > +
> > +	return 0;
> >   }
> >   
> >   static int
> >   ioregion_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t addr,
> >   		int len, const void *val)
> >   {
> > -	return -EOPNOTSUPP;
> > +	struct ioregion *p = to_ioregion(this);
> > +	union {
> > +		struct ioregionfd_cmd cmd;
> > +		struct ioregionfd_resp resp;
> > +	} buf;
> > +	int ret = 0;
> > +	int state = 0;
> > +
> > +	if ((addr + len - 1) > (p->paddr + p->size - 1))
> > +		return -EINVAL;
> > +
> > +	if (unlikely(vcpu->ioregion_interrupted)) {
> > +		vcpu->ioregion_interrupted = false;
> > +
> > +		switch (vcpu->ioregion_ctx.state) {
> > +		case SEND_CMD:
> > +			goto send_cmd;
> > +		case GET_REPLY:
> > +			if (!p->posted_writes)
> > +				goto get_repl;
> > +			fallthrough;
> > +		case COMPLETE:
> > +			return 0;
> > +		}
> > +	}
> > +
> > +send_cmd:
> > +	memset(&buf, 0, sizeof(buf));
> > +	if (!pack_cmd(&buf.cmd, addr - p->paddr, len, IOREGIONFD_CMD_WRITE,
> > +		      p->posted_writes ? 0 : 1, p->user_data, val))
> > +		return -EOPNOTSUPP;
> > +
> > +	ret = kernel_write(p->wf, &buf.cmd, sizeof(buf.cmd), 0);
> > +	state = (ret == sizeof(buf.cmd));
> > +	if (signal_pending(current)) {
> > +		ioregion_save_ctx(vcpu, this, 0, addr, len,
> > +				  0, state, (void *)val);
> > +		return -EINTR;
> > +	}
> > +	if (ret != sizeof(buf.cmd)) {
> > +		ret = (ret < 0) ? ret : -EIO;
> > +		return (ret == -EAGAIN || ret == -EWOULDBLOCK) ? -EINVAL : ret;
> > +	}
> > +
> > +get_repl:
> > +	if (!p->posted_writes) {
> > +		memset(&buf, 0, sizeof(buf));
> > +		ret = kernel_read(p->rf, &buf.resp, sizeof(buf.resp), 0);
> > +		state += (ret == sizeof(buf.resp));
> > +		if (signal_pending(current)) {
> > +			ioregion_save_ctx(vcpu, this, 0, addr, len,
> > +					  0, state, (void *)val);
> > +			return -EINTR;
> > +		}
> > +		if (ret != sizeof(buf.resp)) {
> > +			ret = (ret < 0) ? ret : -EIO;
> > +			return (ret == -EAGAIN || ret == -EWOULDBLOCK) ? -EINVAL : ret;
> > +		}
> > +	}
> > +
> > +	return 0;
> >   }
> >   
> >   /*
> > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> > index 88b92fc3da51..df387857f51f 100644
> > --- a/virt/kvm/kvm_main.c
> > +++ b/virt/kvm/kvm_main.c
> > @@ -4193,6 +4193,7 @@ static int __kvm_io_bus_write(struct kvm_vcpu
> > *vcpu, struct kvm_io_bus *bus,
> >   			      struct kvm_io_range *range, const void
> > *val)
> >   {
> >   	int idx;
> > +	int ret = 0;
> >   
> >   	idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len);
> >   	if (idx < 0)
> > @@ -4200,9 +4201,12 @@ static int __kvm_io_bus_write(struct
> > kvm_vcpu *vcpu, struct kvm_io_bus *bus,
> >   
> >   	while (idx < bus->dev_count &&
> >   		kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
> > -		if (!kvm_iodevice_write(vcpu, bus->range[idx].dev, range->addr,
> > -					range->len, val))
> > +		ret = kvm_iodevice_write(vcpu, bus->range[idx].dev, range->addr,
> > +					 range->len, val);
> > +		if (!ret)
> >   			return idx;
> > +		if (ret < 0 && ret != -EOPNOTSUPP)
> > +			return ret;
> >   		idx++;
> >   	}
> >   
> > @@ -4264,6 +4268,7 @@ static int __kvm_io_bus_read(struct kvm_vcpu
> > *vcpu, struct kvm_io_bus *bus,
> >   			     struct kvm_io_range *range, void *val)
> >   {
> >   	int idx;
> > +	int ret = 0;
> >   
> >   	idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len);
> >   	if (idx < 0)
> > @@ -4271,9 +4276,12 @@ static int __kvm_io_bus_read(struct kvm_vcpu
> > *vcpu, struct kvm_io_bus *bus,
> >   
> >   	while (idx < bus->dev_count &&
> >   		kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
> > -		if (!kvm_iodevice_read(vcpu, bus->range[idx].dev, range->addr,
> > -				       range->len, val))
> > +		ret = kvm_iodevice_read(vcpu, bus->range[idx].dev, range->addr,
> > +					range->len, val);
> > +		if (!ret)
> >   			return idx;
> > +		if (ret < 0 && ret != -EOPNOTSUPP)
> > +			return ret;
> >   		idx++;
> >   	}
> >   



