Re: [RFC 2/2] KVM: add initial support for ioregionfd blocking read/write operations

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Tue, Dec 29, 2020 at 01:02:44PM +0300, Elena Afanasova wrote:
> Signed-off-by: Elena Afanasova <eafanasova@xxxxxxxxx>
> ---
>  virt/kvm/ioregion.c | 157 ++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 157 insertions(+)
> 
> diff --git a/virt/kvm/ioregion.c b/virt/kvm/ioregion.c
> index a200c3761343..8523f4126337 100644
> --- a/virt/kvm/ioregion.c
> +++ b/virt/kvm/ioregion.c
> @@ -4,6 +4,33 @@
>  #include <kvm/iodev.h>
>  #include "eventfd.h"
>  
> +/* Wire protocol */
> +struct ioregionfd_cmd {
> +	__u32 info;
> +	__u32 padding;
> +	__u64 user_data;
> +	__u64 offset;
> +	__u64 data;
> +};
> +
> +struct ioregionfd_resp {
> +	__u64 data;
> +	__u8 pad[24];
> +};
> +
> +#define IOREGIONFD_CMD_READ    0
> +#define IOREGIONFD_CMD_WRITE   1
> +
> +#define IOREGIONFD_SIZE_8BIT   0
> +#define IOREGIONFD_SIZE_16BIT  1
> +#define IOREGIONFD_SIZE_32BIT  2
> +#define IOREGIONFD_SIZE_64BIT  3
> +
> +#define IOREGIONFD_SIZE_OFFSET 4
> +#define IOREGIONFD_RESP_OFFSET 6
> +#define IOREGIONFD_SIZE(x) ((x) << IOREGIONFD_SIZE_OFFSET)
> +#define IOREGIONFD_RESP(x) ((x) << IOREGIONFD_RESP_OFFSET)

These belong in the uapi header so userspace also has struct
ioregionfd_cmd, struct ioregionfd_resp, etc.

> +
>  void
>  kvm_ioregionfd_init(struct kvm *kvm)
>  {
> @@ -38,10 +65,100 @@ ioregion_release(struct ioregion *p)
>  	kfree(p);
>  }
>  
> +static bool
> +pack_cmd(struct ioregionfd_cmd *cmd, u64 offset, u64 len, int opt, bool resp,
> +	 u64 user_data, const void *val)
> +{
> +	u64 size = 0;
> +
> +	switch (len) {
> +	case 1:
> +		size = IOREGIONFD_SIZE_8BIT;
> +		*((u8 *)&cmd->data) = val ? *(u8 *)val : 0;
> +		break;
> +	case 2:
> +		size = IOREGIONFD_SIZE_16BIT;
> +		*((u16 *)&cmd->data) = val ? *(u16 *)val : 0;
> +		break;
> +	case 4:
> +		size = IOREGIONFD_SIZE_32BIT;
> +		*((u32 *)&cmd->data) = val ? *(u32 *)val : 0;
> +		break;
> +	case 8:
> +		size = IOREGIONFD_SIZE_64BIT;
> +		*((u64 *)&cmd->data) = val ? *(u64 *)val : 0;
> +		break;
> +	default:
> +		return false;
>

The assignments and casts can be replaced with a single memcpy after the
switch statement. This is also how KVM_EXIT_MMIO and Coalesced MMIO do
it:

  memcpy(cmd->data, val, len);

However, we need to make sure that cmd has been zeroed so that kernel
memory is not accidentally exposed to userspace.

 +	}
> +	cmd->user_data = user_data;
> +	cmd->offset = offset;
> +	cmd->info |= opt;
> +	cmd->info |= IOREGIONFD_SIZE(size);
> +	cmd->info |= IOREGIONFD_RESP(resp);
> +
> +	return true;
> +}
> +
>  static int
>  ioregion_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t addr,
>  	      int len, void *val)
>  {
> +	struct ioregion *p = to_ioregion(this);
> +	struct ioregionfd_cmd *cmd;
> +	struct ioregionfd_resp *resp;
> +	size_t buf_size;
> +	void *buf;
> +	int ret = 0;
> +
> +	if ((p->rf->f_flags & O_NONBLOCK) || (p->wf->f_flags & O_NONBLOCK))
> +		return -EINVAL;

Another CPU could change file descriptor flags while we are running.
Therefore it might be simplest to handle kernel_write() and
kernel_read() -EAGAIN return values instead of trying to check.

> +	if ((addr + len - 1) > (p->paddr + p->size - 1))
> +		return -EINVAL;
> +
> +	buf_size = max_t(size_t, sizeof(*cmd), sizeof(*resp));
> +	buf = kzalloc(buf_size, GFP_KERNEL);
> +	if (!buf)
> +		return -ENOMEM;
> +	cmd = (struct ioregionfd_cmd *)buf;
> +	resp = (struct ioregionfd_resp *)buf;

I think they are small enough to declare them on the stack:

  union {
      struct ioregionfd_cmd cmd;
      struct ioregionfd_resp resp;
  } buf;

  memset(&buf, 0, sizeof(buf));

> +	if (!pack_cmd(cmd, addr - p->paddr, len, IOREGIONFD_CMD_READ,
> +		      1, p->user_data, NULL)) {
> +		kfree(buf);
> +		return -EOPNOTSUPP;
> +	}
> +
> +	ret = kernel_write(p->wf, cmd, sizeof(*cmd), 0);
> +	if (ret != sizeof(*cmd)) {
> +		kfree(buf);
> +		return (ret < 0) ? ret : -EIO;
> +	}
> +	memset(buf, 0, buf_size);
> +	ret = kernel_read(p->rf, resp, sizeof(*resp), 0);
> +	if (ret != sizeof(*resp)) {
> +		kfree(buf);
> +		return (ret < 0) ? ret : -EIO;
> +	}
> +
> +	switch (len) {
> +	case 1:
> +		*(u8 *)val = (u8)resp->data;
> +		break;
> +	case 2:
> +		*(u16 *)val = (u16)resp->data;
> +		break;
> +	case 4:
> +		*(u32 *)val = (u32)resp->data;
> +		break;
> +	case 8:
> +		*(u64 *)val = (u64)resp->data;
> +		break;
> +	default:
> +		break;
> +	}

This looks inconsistent. cmd->data is treated as a packed u8/u16/u32/864
whereas resp->data is treated as u64?

I was expecting memcpy(val, &resp->data, len) here not the u64 ->
u8/u16/u32/u64 conversion.

Attachment: signature.asc
Description: PGP signature


[Index of Archives]     [KVM ARM]     [KVM ia64]     [KVM ppc]     [Virtualization Tools]     [Spice Development]     [Libvirt]     [Libvirt Users]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite Questions]     [Linux Kernel]     [Linux SCSI]     [XFree86]

  Powered by Linux