Support an inter-VM shared memory device that maps a shared-memory object as a
PCI device in the guest. This patch also supports interrupts between guests by
communicating over a unix domain socket. This patch applies to the qemu-kvm
repository.

    -device ivshmem,size=<size in MB>[,shm=<shm name>]

Interrupts between multiple VMs are supported through a shared memory server,
which the device reaches over a chardev socket.

    -device ivshmem,size=<size in MB>[,shm=<shm name>][,chardev=<id>][,msi=on]
            [,irqfd=on][,vectors=n]
    -chardev socket,path=<path>,id=<id>

Sample programs, init scripts and the shared memory server are available in a
git repo here:

    www.gitorious.org/nahanni
---
 Makefile.target |    3 +
 hw/ivshmem.c    |  700 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 qemu-char.c     |    6 +
 qemu-char.h     |    3 +
 4 files changed, 712 insertions(+), 0 deletions(-)
 create mode 100644 hw/ivshmem.c

diff --git a/Makefile.target b/Makefile.target
index 1ffd802..bc9a681 100644
--- a/Makefile.target
+++ b/Makefile.target
@@ -199,6 +199,9 @@ obj-$(CONFIG_USB_OHCI) += usb-ohci.o
 obj-y += rtl8139.o
 obj-y += e1000.o
 
+# Inter-VM PCI shared memory
+obj-y += ivshmem.o
+
 # Hardware support
 obj-i386-y = pckbd.o dma.o
 obj-i386-y += vga.o
diff --git a/hw/ivshmem.c b/hw/ivshmem.c
new file mode 100644
index 0000000..2ec6c2c
--- /dev/null
+++ b/hw/ivshmem.c
@@ -0,0 +1,700 @@
+/*
+ * Inter-VM Shared Memory PCI device.
+ *
+ * Author:
+ *      Cam Macdonell <cam@xxxxxxxxxxxxxx>
+ *
+ * Based On: cirrus_vga.c and rtl8139.c
+ *
+ * This code is licensed under the GNU GPL v2.
+ */
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/io.h>
+#include <sys/ioctl.h>
+#include <sys/eventfd.h>
+#include "hw.h"
+#include "console.h"
+#include "pc.h"
+#include "pci.h"
+#include "sysemu.h"
+
+#include "msix.h"
+#include "qemu-kvm.h"
+#include "libkvm.h"
+
+#define PCI_COMMAND_IOACCESS 0x0001
+#define PCI_COMMAND_MEMACCESS 0x0002
+
+#define DEBUG_IVSHMEM
+
+/* bit positions inside IVShmemState.features */
+#define IVSHMEM_IRQFD 0
+#define IVSHMEM_MSI 1
+/* bounds both the number of peers tracked and the eventfds kept per peer */
+#define IVSHMEM_MAX_EVENTFDS 16
+
+#ifdef DEBUG_IVSHMEM
+#define IVSHMEM_DPRINTF(fmt, args...) \
+    do {printf("IVSHMEM: " fmt, ##args); } while (0)
+#else
+#define IVSHMEM_DPRINTF(fmt, args...)
+#endif
+
+/* value written to IntrStatus when a new peer shows up and MSI-X is off */
+#define NEW_GUEST_VAL UINT_MAX
+
+/* per-vector context handed to fake_irqfd() */
+struct eventfd_entry {
+    PCIDevice *pdev;
+    int vector;
+};
+
+typedef struct IVShmemState {
+    PCIDevice dev;
+    uint32_t intrmask;
+    uint32_t intrstatus;
+    uint32_t doorbell;
+
+    CharDriverState * chr;          /* connection to the shared memory server */
+    CharDriverState ** eventfd_chr; /* one chardev per own eventfd (no irqfd) */
+    int ivshmem_mmio_io_addr;
+
+    pcibus_t mmio_addr;
+    uint8_t *ivshmem_ptr;
+    unsigned long ivshmem_offset;
+    unsigned int ivshmem_size;      /* size of the shared region in bytes */
+    int shm_fd; /* shared memory file descriptor */
+
+    /* array of eventfds for each guest */
+    int * eventfds[IVSHMEM_MAX_EVENTFDS];
+    /* keep track of # of eventfds for each guest*/
+    int * eventfds_posn_count;
+
+    int vm_id;                      /* -1 until the server assigns a position */
+    int num_eventfds;
+    uint32_t vectors;
+    uint32_t features;
+    struct eventfd_entry eventfd_table[IVSHMEM_MAX_EVENTFDS];
+
+    char * shmobj;
+    uint32_t size; /*size of shared memory in MB*/
+} IVShmemState;
+
+/* registers for the Inter-VM shared memory device */
+enum ivshmem_registers {
+    IntrMask = 0,
+    IntrStatus = 4,
+    IVPosition = 8,
+    Doorbell = 12,
+};
+
+/* Returns non-zero when feature bit 'feature' is set in ivs->features. */
+static inline uint32_t ivshmem_has_feature(IVShmemState *ivs, int feature) {
+    return (ivs->features & (1 << feature));
+}
+
+/*
+ * Returns 1 when x is a positive power of two, 0 otherwise.
+ * Zero and negative values are rejected: (x & (x-1)) == 0 alone is also
+ * true for 0, which would let an unset "size" property through.
+ */
+static inline int is_power_of_two(int x) {
+    return x > 0 && (x & (x - 1)) == 0;
+}
+
+/* BAR 2 map callback: back the guest-physical range with the shared memory. */
+static void ivshmem_map(PCIDevice *pci_dev, int region_num,
+                        pcibus_t addr, pcibus_t size, int type)
+{
+    IVShmemState *s = DO_UPCAST(IVShmemState, dev, pci_dev);
+
+    IVSHMEM_DPRINTF("addr = %u size = %u\n", (uint32_t)addr, (uint32_t)size);
+    cpu_register_physical_memory(addr, s->ivshmem_size, s->ivshmem_offset);
+}
+
+/* accessing registers - based on rtl8139 */
+
+/*
+ * Drive the legacy interrupt line from (intrstatus & intrmask).
+ * 'val' is unused; it is kept so the callers do not have to change.
+ */
+static void ivshmem_update_irq(IVShmemState *s, int val)
+{
+    uint32_t isr = s->intrstatus & s->intrmask;
+
+    /* don't print ISR resets */
+    if (isr) {
+        IVSHMEM_DPRINTF("Set IRQ to %d (%04x %04x)\n",
+                        isr ? 1 : 0, s->intrstatus, s->intrmask);
+    }
+
+    qemu_set_irq(s->dev.irq[0], (isr != 0));
+}
+
+static void ivshmem_IntrMask_write(IVShmemState *s, uint32_t val)
+{
+    IVSHMEM_DPRINTF("IntrMask write(w) val = 0x%04x\n", val);
+
+    s->intrmask = val;
+
+    ivshmem_update_irq(s, val);
+}
+
+static uint32_t ivshmem_IntrMask_read(IVShmemState *s)
+{
+    uint32_t ret = s->intrmask;
+
+    IVSHMEM_DPRINTF("intrmask read(w) val = 0x%04x\n", ret);
+
+    return ret;
+}
+
+static void ivshmem_IntrStatus_write(IVShmemState *s, uint32_t val)
+{
+    IVSHMEM_DPRINTF("IntrStatus write(w) val = 0x%04x\n", val);
+
+    s->intrstatus = val;
+
+    ivshmem_update_irq(s, val);
+}
+
+static uint32_t ivshmem_IntrStatus_read(IVShmemState *s)
+{
+    uint32_t ret = s->intrstatus;
+
+    /* reading ISR clears all interrupts */
+    s->intrstatus = 0;
+
+    ivshmem_update_irq(s, 0);
+
+    return ret;
+}
+
+static void ivshmem_io_writew(void *opaque, uint8_t addr, uint32_t val)
+{
+    IVSHMEM_DPRINTF("We shouldn't be writing words\n");
+}
+
+/*
+ * 32-bit register write.
+ *
+ * For the Doorbell register the value is decoded as
+ *   bits 31..16: destination peer position
+ *   bits  7..0 : vector
+ * and a 1 is written to the matching eventfd of that peer.
+ */
+static void ivshmem_io_writel(void *opaque, uint8_t addr, uint32_t val)
+{
+    IVShmemState *s = opaque;
+
+    u_int64_t write_one = 1;
+    u_int16_t dest = val >> 16;
+    u_int16_t vector = val & 0xff;
+
+    addr &= 0xfe;
+
+    switch (addr)
+    {
+        case IntrMask:
+            ivshmem_IntrMask_write(s, val);
+            break;
+
+        case IntrStatus:
+            ivshmem_IntrStatus_write(s, val);
+            break;
+
+        case Doorbell:
+            IVSHMEM_DPRINTF("Writing %ld to VM %d on vector %d\n",
+                            (long)write_one, dest, vector);
+
+            /* 'dest' comes straight from the guest: validate it before it
+             * is used as an index.  eventfds_posn_count only exists when a
+             * shared memory server is in use. */
+            if (s->eventfds_posn_count == NULL ||
+                dest >= IVSHMEM_MAX_EVENTFDS) {
+                IVSHMEM_DPRINTF("Invalid VM Doorbell VM %d\n", dest);
+                break;
+            }
+
+            /* check doorbell range */
+            /* NOTE(review): vector 0 is never rung from here; this matches
+             * ivshmem_read(), which skips ioeventfd setup for vector 0, but
+             * please confirm that it is intended for the non-MSI case. */
+            if ((vector > 0) && (vector < s->eventfds_posn_count[dest])) {
+                if (write(s->eventfds[dest][vector], &write_one,
+                          sizeof(write_one)) != (ssize_t)sizeof(write_one)) {
+                    IVSHMEM_DPRINTF("error writing to eventfd\n");
+                }
+            }
+            break;
+
+        default:
+            IVSHMEM_DPRINTF("Invalid register write at 0x%x\n", addr);
+    }
+}
+
+static void ivshmem_io_writeb(void *opaque, uint8_t addr, uint32_t val)
+{
+    IVSHMEM_DPRINTF("We shouldn't be writing bytes\n");
+}
+
+static uint32_t ivshmem_io_readw(void *opaque, uint8_t addr)
+{
+    IVSHMEM_DPRINTF("We shouldn't be reading words\n");
+    return 0;
+}
+
+/* 32-bit register read; unknown offsets read as 0. */
+static uint32_t ivshmem_io_readl(void *opaque, uint8_t addr)
+{
+    IVShmemState *s = opaque;
+    uint32_t ret;
+
+    switch (addr)
+    {
+        case IntrMask:
+            ret = ivshmem_IntrMask_read(s);
+            break;
+
+        case IntrStatus:
+            ret = ivshmem_IntrStatus_read(s);
+            break;
+
+        case IVPosition:
+            /* return my id in the ivshmem list */
+            ret = s->vm_id;
+            break;
+
+        default:
+            IVSHMEM_DPRINTF("why are we reading 0x%x\n", addr);
+            ret = 0;
+    }
+
+    return ret;
+}
+
+static uint32_t ivshmem_io_readb(void *opaque, uint8_t addr)
+{
+    IVSHMEM_DPRINTF("We shouldn't be reading bytes\n");
+
+    return 0;
+}
+
+/* MMIO trampolines: reduce the address to a register offset and dispatch. */
+static void ivshmem_mmio_writeb(void *opaque,
+                                target_phys_addr_t addr, uint32_t val)
+{
+    ivshmem_io_writeb(opaque, addr & 0xFF, val);
+}
+
+static void ivshmem_mmio_writew(void *opaque,
+                                target_phys_addr_t addr, uint32_t val)
+{
+    ivshmem_io_writew(opaque, addr & 0xFF, val);
+}
+
+static void ivshmem_mmio_writel(void *opaque,
+                                target_phys_addr_t addr, uint32_t val)
+{
+    ivshmem_io_writel(opaque, addr & 0xFF, val);
+}
+
+static uint32_t ivshmem_mmio_readb(void *opaque, target_phys_addr_t addr)
+{
+    return ivshmem_io_readb(opaque, addr & 0xFF);
+}
+
+static uint32_t ivshmem_mmio_readw(void *opaque, target_phys_addr_t addr)
+{
+    uint32_t val = ivshmem_io_readw(opaque, addr & 0xFF);
+    return val;
+}
+
+static uint32_t ivshmem_mmio_readl(void *opaque, target_phys_addr_t addr)
+{
+    uint32_t val = ivshmem_io_readl(opaque, addr & 0xFF);
+    return val;
+}
+
+static CPUReadMemoryFunc *ivshmem_mmio_read[3] = {
+    ivshmem_mmio_readb,
+    ivshmem_mmio_readw,
+    ivshmem_mmio_readl,
+};
+
+static CPUWriteMemoryFunc *ivshmem_mmio_write[3] = {
+    ivshmem_mmio_writeb,
+    ivshmem_mmio_writew,
+    ivshmem_mmio_writel,
+};
+
+/* eventfd chardev handler (no MSI): latch the first byte into IntrStatus. */
+static void ivshmem_receive(void *opaque, const uint8_t *buf, int size)
+{
+    IVShmemState *s = opaque;
+
+    ivshmem_IntrStatus_write(s, *buf);
+
+    IVSHMEM_DPRINTF("ivshmem_receive 0x%02x\n", *buf);
+}
+
+/* Accept up to 8 bytes per callback. */
+static int ivshmem_can_receive(void * opaque)
+{
+    return 8;
+}
+
+static void ivshmem_event(void *opaque, int event)
+{
+    IVSHMEM_DPRINTF("ivshmem_event %d\n", event);
+}
+
+/* eventfd chardev handler (MSI without irqfd): raise the MSI-X vector. */
+static void fake_irqfd(void *opaque, const uint8_t *buf, int size) {
+
+    struct eventfd_entry *entry = opaque;
+    PCIDevice *pdev = entry->pdev;
+
+    IVSHMEM_DPRINTF("fake irqfd on vector %d\n", entry->vector);
+    msix_notify(pdev, entry->vector);
+}
+
+/*
+ * Wrap 'eventfd' in a character device so that it gets polled.
+ * 'vector' must be below IVSHMEM_MAX_EVENTFDS (the caller checks this).
+ * Exits the process when the chardev cannot be created.
+ */
+static CharDriverState* create_eventfd_chr_device(void * opaque, int eventfd,
+                                                  int vector)
+{
+    /* create an event character device based on the passed eventfd */
+    IVShmemState *s = opaque;
+    CharDriverState * chr;
+
+    chr = qemu_chr_open_eventfd(eventfd);
+
+    if (chr == NULL) {
+        IVSHMEM_DPRINTF("creating eventfd for eventfd %d failed\n", eventfd);
+        exit(-1);
+    }
+
+    if (ivshmem_has_feature(s, IVSHMEM_MSI)) {
+        s->eventfd_table[vector].pdev = &s->dev;
+        s->eventfd_table[vector].vector = vector;
+
+        qemu_chr_add_handlers(chr, ivshmem_can_receive, fake_irqfd,
+                      ivshmem_event, &s->eventfd_table[vector]);
+    } else {
+        qemu_chr_add_handlers(chr, ivshmem_can_receive, ivshmem_receive,
+                      ivshmem_event, s);
+    }
+
+    return chr;
+}
+
+/*
+ * Check that the guest isn't going to try and map more memory than the
+ * server allocated.  Returns 0 when the object behind 'shmemfd' is large
+ * enough, -1 when it is too small or cannot be examined.
+ */
+static int check_shm_size(IVShmemState *s, int shmemfd) {
+
+    struct stat buf;
+
+    if (fstat(shmemfd, &buf) != 0) {
+        fprintf(stderr, "IVSHMEM ERROR: could not stat shared object\n");
+        return -1;
+    }
+
+    if ((long long)s->ivshmem_size > (long long)buf.st_size) {
+        fprintf(stderr, "IVSHMEM ERROR: Requested memory size greater");
+        fprintf(stderr, " than shared object size (%u > %lld)\n",
+                s->ivshmem_size, (long long)buf.st_size);
+        return -1;
+    }
+
+    return 0;
+}
+
+/* Map the shared object behind 'fd' and expose it to the guest as BAR 2. */
+static void create_shared_memory_BAR(IVShmemState *s, int fd) {
+
+    s->shm_fd = fd;
+
+    s->ivshmem_offset = qemu_ram_mmap(s->shm_fd, s->ivshmem_size,
+                                      MAP_SHARED, 0);
+
+    s->ivshmem_ptr = qemu_get_ram_ptr(s->ivshmem_offset);
+
+    /* region for shared memory */
+    pci_register_bar(&s->dev, 2, s->ivshmem_size,
+                     PCI_BASE_ADDRESS_SPACE_MEMORY, ivshmem_map);
+}
+
+/*
+ * Bind eventfd 'fd' to the GSI of MSI-X vector 'vector' through KVM_IRQFD.
+ * Returns 0 on success, -EINVAL for an out-of-range vector, or the negative
+ * value returned by kvm_vm_ioctl().
+ */
+static int ivshmem_irqfd(PCIDevice* pdev, uint16_t vector, int fd)
+{
+    struct kvm_irqfd call = { };
+    int r;
+
+    IVSHMEM_DPRINTF("inside irqfd\n");
+    if (vector >= pdev->msix_entries_nr) {
+        return -EINVAL;
+    }
+    call.fd = fd;
+    call.gsi = pdev->msix_irq_entries[vector].gsi;
+    r = kvm_vm_ioctl(kvm_state, KVM_IRQFD, &call);
+    if (r < 0) {
+        IVSHMEM_DPRINTF("allocating irqfd failed %d\n", r);
+        return r;
+    }
+    return 0;
+}
+
+/*
+ * Ask KVM to signal 'fd' whenever the guest writes (posn << 16 | vector)
+ * to the Doorbell register.  Returns the kvm_vm_ioctl() result.
+ */
+static int ivshmem_ioeventfd(IVShmemState* s, int posn, int fd, int vector)
+{
+    int ret;
+    /* zero-initialised so that the padding handed to the kernel is defined */
+    struct kvm_ioeventfd iofd = { };
+
+    iofd.datamatch = (posn << 16) | vector;
+    iofd.addr = s->mmio_addr + Doorbell;
+    iofd.len = 4;
+    iofd.flags = KVM_IOEVENTFD_FLAG_DATAMATCH;
+    iofd.fd = fd;
+
+    ret = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &iofd);
+
+    if (ret < 0) {
+        /* assumes kvm_vm_ioctl() reports failure as a negative errno,
+         * as ivshmem_irqfd() above also does -- TODO confirm */
+        fprintf(stderr, "error assigning ioeventfd (%d): %s\n",
+                ret, strerror(-ret));
+    } else {
+        IVSHMEM_DPRINTF("success assigning ioeventfd (%d:%d)\n", posn, vector);
+    }
+
+    return ret;
+}
+
+/* notify that a new guest has joined */
+static void new_guest_interrupt(IVShmemState *s)
+{
+    if (msix_enabled(&s->dev)) {
+        msix_notify(&s->dev, 0);
+    } else {
+        ivshmem_IntrStatus_write(s, NEW_GUEST_VAL);
+    }
+}
+
+/*
+ * Close and forget every eventfd stored for the peer at 'posn'.
+ * Out-of-range positions are ignored.  The slot is reset to NULL so that a
+ * repeated call cannot free the same array twice.
+ */
+static void close_guest_eventfds(IVShmemState *s, int posn)
+{
+    int i, guest_curr_max;
+
+    if (posn < 0 || posn >= IVSHMEM_MAX_EVENTFDS) {
+        return;
+    }
+
+    guest_curr_max = s->eventfds_posn_count[posn];
+
+    for (i = 0; i < guest_curr_max; i++) {
+        close(s->eventfds[posn][i]);
+    }
+
+    free(s->eventfds[posn]);
+    s->eventfds[posn] = NULL;
+    s->eventfds_posn_count[posn] = 0;
+}
+
+/*
+ * Handler for messages from the shared memory server.
+ *
+ * Each message carries a 'long' position and optionally a file descriptor:
+ *   no fd, posn >= 0 : our own position, or a peer that has gone away
+ *   fd,    posn == -1: the shared memory object
+ *   fd,    posn >= 0 : the next eventfd of the peer at that position
+ *
+ * 'flags' is the third argument of the chardev read callback; the sibling
+ * handlers registered in the same slot name it 'size', so it is treated as
+ * the number of valid bytes in 'buf'.
+ */
+static void ivshmem_read(void *opaque, const uint8_t * buf, int flags)
+{
+    IVShmemState *s = opaque;
+    int incoming_fd, tmp_fd;
+    int guest_curr_max;
+    long incoming_posn;
+
+    if (flags < (int)sizeof(incoming_posn)) {
+        IVSHMEM_DPRINTF("short message from server (%d bytes)\n", flags);
+        return;
+    }
+
+    memcpy(&incoming_posn, buf, sizeof(incoming_posn));
+    /* pick off s->chr->msgfd and store it, posn should accompany msg */
+    tmp_fd = qemu_chr_get_msgfd(s->chr);
+    IVSHMEM_DPRINTF("posn is %ld, fd is %d\n", incoming_posn, tmp_fd);
+
+    if (tmp_fd == -1) {
+        if (incoming_posn < 0 || incoming_posn >= IVSHMEM_MAX_EVENTFDS) {
+            IVSHMEM_DPRINTF("ignoring invalid posn %ld\n", incoming_posn);
+            return;
+        }
+
+        if (s->eventfds[incoming_posn] == NULL) {
+            /* unseen position: it is ours, but only while we have none;
+             * otherwise a stray "gone away" would overwrite our id */
+            if (s->vm_id == -1) {
+                /* receive our posn */
+                s->vm_id = incoming_posn;
+            } else {
+                IVSHMEM_DPRINTF("posn %ld has no eventfds to close\n",
+                                incoming_posn);
+            }
+            return;
+        }
+
+        /* otherwise an fd == -1 means an existing guest has gone away */
+        IVSHMEM_DPRINTF("posn %ld has gone away\n", incoming_posn);
+        close_guest_eventfds(s, incoming_posn);
+        return;
+    }
+
+    /* because of the implementation of get_msgfd, we need a dup */
+    incoming_fd = dup(tmp_fd);
+    if (incoming_fd == -1) {
+        fprintf(stderr, "ivshmem: could not dup received fd\n");
+        return;
+    }
+
+    /* if the position is -1, then it's shared memory region fd */
+    if (incoming_posn == -1) {
+
+        s->num_eventfds = 0;
+
+        if (check_shm_size(s, incoming_fd) == -1) {
+            exit(-1);
+        }
+
+        /* creating a BAR in qemu_chr callback may be crazy */
+        create_shared_memory_BAR(s, incoming_fd);
+
+        return;
+    }
+
+    /* the position indexes fixed-size tables: never trust it blindly */
+    if (incoming_posn < 0 || incoming_posn >= IVSHMEM_MAX_EVENTFDS) {
+        IVSHMEM_DPRINTF("ignoring eventfd for invalid posn %ld\n",
+                        incoming_posn);
+        close(incoming_fd);
+        return;
+    }
+
+    /* each guest has an array of eventfds, and we keep track of how many
+     * eventfds we hold for each guest */
+    guest_curr_max = s->eventfds_posn_count[incoming_posn];
+    if (guest_curr_max >= IVSHMEM_MAX_EVENTFDS) {
+        IVSHMEM_DPRINTF("too many eventfds for posn %ld\n", incoming_posn);
+        close(incoming_fd);
+        return;
+    }
+
+    if (guest_curr_max == 0) {
+        s->eventfds[incoming_posn] = malloc(IVSHMEM_MAX_EVENTFDS *
+                                            sizeof(int));
+        if (s->eventfds[incoming_posn] == NULL) {
+            fprintf(stderr, "ivshmem: out of memory\n");
+            close(incoming_fd);
+            return;
+        }
+        new_guest_interrupt(s);
+    }
+
+    /* this is an eventfd for a particular guest VM */
+    IVSHMEM_DPRINTF("eventfds[%ld][%d] = %d\n", incoming_posn, guest_curr_max,
+                    incoming_fd);
+    s->eventfds[incoming_posn][guest_curr_max] = incoming_fd;
+
+    /* increment count for particular guest */
+    s->eventfds_posn_count[incoming_posn]++;
+
+    /* ioeventfd and irqfd are enabled together,
+     * so the flag IRQFD refers to both */
+    if (ivshmem_has_feature(s, IVSHMEM_IRQFD) && guest_curr_max > 0) {
+        /* allocate ioeventfd for the new fd
+         * received for guest @ incoming_posn */
+        ivshmem_ioeventfd(s, incoming_posn, incoming_fd, guest_curr_max);
+    }
+
+    /* keep track of the maximum VM ID */
+    if (incoming_posn > s->num_eventfds) {
+        s->num_eventfds = incoming_posn;
+    }
+
+    if (incoming_posn == s->vm_id) {
+        if (ivshmem_has_feature(s, IVSHMEM_IRQFD)) {
+            /* setup irqfd for this VM's eventfd */
+            if (ivshmem_irqfd(&s->dev, guest_curr_max,
+                              s->eventfds[s->vm_id][guest_curr_max]) < 0) {
+                fprintf(stderr, "ivshmem: irqfd setup failed for vector %d\n",
+                        guest_curr_max);
+            }
+        } else {
+            /* initialize char device for callback
+             * if this is one of my eventfd */
+            s->eventfd_chr[guest_curr_max] = create_eventfd_chr_device(s,
+                    s->eventfds[s->vm_id][guest_curr_max], guest_curr_max);
+        }
+    }
+}
+
+static void ivshmem_reset(DeviceState *d)
+{
+    return;
+}
+
+/* BAR 0 map callback: install the register window and start talking to the
+ * shared memory server, if one was configured. */
+static void ivshmem_mmio_map(PCIDevice *pci_dev, int region_num,
+                             pcibus_t addr, pcibus_t size, int type)
+{
+    IVShmemState *s = DO_UPCAST(IVShmemState, dev, pci_dev);
+
+    s->mmio_addr = addr;
+    cpu_register_physical_memory(addr + 0, 0x400, s->ivshmem_mmio_io_addr);
+
+    /* now that our mmio region has been allocated, we can receive
+     * the file descriptors.  eventfds_posn_count is only allocated in
+     * server mode, so it doubles as the "server in use" marker. */
+    if (s->chr != NULL && s->eventfds_posn_count != NULL) {
+        qemu_chr_add_handlers(s->chr, ivshmem_can_receive, ivshmem_read,
+                              ivshmem_event, s);
+    }
+}
+
+/*
+ * qdev init: validate the properties, fill in the PCI config space, register
+ * the BARs and either wait for the shared memory server or open the POSIX
+ * shared memory object directly.  Configuration errors terminate the process.
+ */
+static int pci_ivshmem_init(PCIDevice *dev)
+{
+    IVShmemState *s = DO_UPCAST(IVShmemState, dev, dev);
+    uint8_t *pci_conf;
+    int i;
+
+    /* BARs must be a power of 2 */
+    if (!is_power_of_two(s->size)) {
+        fprintf(stderr, "ivshmem: size must be power of 2\n");
+        exit(1);
+    }
+
+    /* size is in MB; make sure the byte count fits ivshmem_size */
+    if (s->size > UINT_MAX / (1024 * 1024)) {
+        fprintf(stderr, "ivshmem: size is too large\n");
+        exit(1);
+    }
+    s->ivshmem_size = s->size * 1024 * 1024;
+
+    /* IRQFD requires MSI */
+    if (ivshmem_has_feature(s, IVSHMEM_IRQFD) &&
+        !ivshmem_has_feature(s, IVSHMEM_MSI)) {
+        fprintf(stderr, "ivshmem: ioeventfd/irqfd requires MSI\n");
+        exit(1);
+    }
+
+    pci_conf = s->dev.config;
+    pci_conf[0x00] = 0xf4; /* vendor ID 0x1af4, little endian */
+    pci_conf[0x01] = 0x1a;
+    pci_conf[0x02] = 0x10; /* device ID 0x1110, little endian */
+    pci_conf[0x03] = 0x11;
+    pci_conf[0x04] = PCI_COMMAND_IOACCESS | PCI_COMMAND_MEMACCESS;
+    pci_conf[0x0a] = 0x00; /* RAM controller */
+    pci_conf[0x0b] = 0x05;
+    pci_conf[0x0e] = 0x00; /* header_type */
+
+    s->ivshmem_mmio_io_addr = cpu_register_io_memory(ivshmem_mmio_read,
+                                    ivshmem_mmio_write, s);
+    /* region for registers*/
+    pci_register_bar(&s->dev, 0, 0x400,
+                     PCI_BASE_ADDRESS_SPACE_MEMORY, ivshmem_mmio_map);
+
+    /* allocate the MSI-X vectors */
+    if (ivshmem_has_feature(s, IVSHMEM_MSI)) {
+
+        if (msix_init(&s->dev, s->vectors, 1, 0)) {
+            /* do not touch the vectors of a table that was never set up */
+            fprintf(stderr, "ivshmem: msix initialization failed\n");
+            exit(1);
+        }
+
+        pci_register_bar(&s->dev, 1,
+                         msix_bar_size(&s->dev),
+                         PCI_BASE_ADDRESS_SPACE_MEMORY,
+                         msix_mmio_map);
+        IVSHMEM_DPRINTF("msix initialized (%d vectors)\n", s->vectors);
+
+        /* 'activate' the vectors */
+        for (i = 0; i < s->vectors; i++) {
+            msix_vector_use(&s->dev, i);
+        }
+    }
+
+    if ((s->chr != NULL) && (strncmp(s->chr->filename, "unix:", 5) == 0)) {
+        /* if we get a UNIX socket as the parameter we will talk
+         * to the ivshmem server later once the MMIO BAR is actually
+         * allocated (see ivshmem_mmio_map) */
+
+        s->eventfds_posn_count = qemu_mallocz(IVSHMEM_MAX_EVENTFDS *
+                                              sizeof(int));
+
+        IVSHMEM_DPRINTF("using shared memory server (socket = %s)\n",
+                        s->chr->filename);
+
+        s->vm_id = -1;
+
+    } else {
+        /* just map the file immediately, we're not using a server */
+        int fd;
+
+        if (s->shmobj == NULL) {
+            fprintf(stderr, "kvm_ivshmem: no shared memory object given\n");
+            exit(-1);
+        }
+
+        IVSHMEM_DPRINTF("using shm_open (shm object = %s)\n", s->shmobj);
+
+        if ((fd = shm_open(s->shmobj, O_CREAT|O_RDWR,
+                           S_IRWXU|S_IRWXG|S_IRWXO)) < 0) {
+            fprintf(stderr, "kvm_ivshmem: could not open shared file\n");
+            exit(-1);
+        }
+
+        /* size the object before it is mapped; mapping an object that is
+         * too small would fault on first access */
+        if (ftruncate(fd, s->ivshmem_size) != 0) {
+            fprintf(stderr, "kvm_ivshmem: could not truncate shared file\n");
+            exit(-1);
+        }
+
+        create_shared_memory_BAR(s, fd);
+
+    }
+
+    IVSHMEM_DPRINTF("shared memory size is = %d\n", s->size);
+
+    pci_conf[PCI_INTERRUPT_PIN] = 1; /* we are going to support interrupts */
+
+    if (!ivshmem_has_feature(s, IVSHMEM_IRQFD)) {
+        /* zeroed so that unused slots are NULL */
+        s->eventfd_chr = qemu_mallocz(IVSHMEM_MAX_EVENTFDS *
+                                      sizeof(*s->eventfd_chr));
+    }
+
+    return 0;
+}
+
+static int pci_ivshmem_uninit(PCIDevice *dev)
+{
+    IVShmemState *s = DO_UPCAST(IVShmemState, dev, dev);
+
+    cpu_unregister_io_memory(s->ivshmem_mmio_io_addr);
+
+    return 0;
+}
+
+static PCIDeviceInfo ivshmem_info = {
+    .qdev.name  = "ivshmem",
+    .qdev.size  = sizeof(IVShmemState),
+    .qdev.reset = ivshmem_reset,
+    .init       = pci_ivshmem_init,
+    .exit       = pci_ivshmem_uninit,
+    .qdev.props = (Property[]) {
+        DEFINE_PROP_CHR("chardev", IVShmemState, chr),
+        DEFINE_PROP_UINT32("size", IVShmemState, size, 0),
+        DEFINE_PROP_UINT32("vectors", IVShmemState, vectors, 2),
+        DEFINE_PROP_BIT("irqfd", IVShmemState, features, IVSHMEM_IRQFD, false),
+        DEFINE_PROP_BIT("msi", IVShmemState, features, IVSHMEM_MSI, true),
+        DEFINE_PROP_STRING("shm", IVShmemState, shmobj),
+        DEFINE_PROP_END_OF_LIST(),
+    }
+};
+
+static void ivshmem_register_devices(void)
+{
+    pci_qdev_register(&ivshmem_info);
+}
+
+device_init(ivshmem_register_devices)
diff --git a/qemu-char.c b/qemu-char.c
index 048da3f..41cb8c7 100644
--- a/qemu-char.c
+++ b/qemu-char.c
@@ -2076,6 +2076,12 @@ static void tcp_chr_read(void *opaque)
     }
 }
 
+CharDriverState *qemu_chr_open_eventfd(int eventfd){
+
+    return qemu_chr_open_fd(eventfd, eventfd);
+
+}
+
 static void tcp_chr_connect(void *opaque)
 {
     CharDriverState *chr = opaque;
diff --git a/qemu-char.h b/qemu-char.h
index 3a9427b..1571091 100644
--- a/qemu-char.h
+++ b/qemu-char.h
@@ -93,6 +93,9 @@ void qemu_chr_info_print(Monitor *mon, const QObject *ret_data);
 void qemu_chr_info(Monitor *mon, QObject **ret_data);
 CharDriverState *qemu_chr_find(const char *name);
 
+/* add an eventfd to the qemu devices that are polled */
+CharDriverState *qemu_chr_open_eventfd(int eventfd);
+
 extern int term_escape_char;
 
 /* async I/O support */
-- 
1.6.0.6

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html