Hi Cam, I have gone through your latest shared memory patch. I have a few questions and comments. Comment:- + if (ivshmem_enabled) { + ivshmem_init(ivshmem_device); + ram_size += ivshmem_get_size(); + } + In your initial patch this part of the patch is + if (ivshmem_enabled) { + ivshmem_init(ivshmem_device); + phys_ram_size += ivshmem_get_size(); + } I think the phys_ram_size += ivshmem_get_size(); is correct. Question:- You are giving the desired virtual address for mmapping the shared memory object as "s->ivshmem_ptr" which is "phys_ram_base + s->ivshmem_offset". This desired virtual address is nothing but the base virtual address of the memory that you are allocating after incrementing phys_ram_size. So now s->ivshmem_ptr would point to a new set of memory, which is the shared memory region instead of the memory allocated through qemu_alloc_physram, which means if pages are allocated for the "s->ivshmem_ptr" virtual address range then those pages can never be addressed again. Correct me if my understanding is wrong. Thx, Venkat -----Original Message----- From: kvm-owner@xxxxxxxxxxxxxxx [mailto:kvm-owner@xxxxxxxxxxxxxxx] On Behalf Of Cam Macdonell Sent: Thursday, May 07, 2009 9:47 PM To: kvm@xxxxxxxxxxxxxxx Cc: Cam Macdonell Subject: [PATCH v2] Shared memory device with interrupt support Support an inter-vm shared memory device that maps a shared-memory object as a PCI device in the guest. This patch also supports interrupts between guests by communicating over a unix domain socket. This patch applies to the qemu-kvm repository. This device now creates a qemu character device and sends 1-byte messages to trigger interrupts. Interrupts are triggered by writing to the "Doorbell" register on the shared memory PCI device. The lower 8 bits of the value written to this register are sent as the 1-byte message so different meanings of interrupts can be supported. Interrupts are only supported between 2 VMs currently. 
One VM must act as the server by adding "server" to the command-line argument. Shared memory devices are created with the following command-line: -ivshmem <shm object>,<size in MB>[,unix:<path>][,server] Interrupts can also be used between host and guest by implementing a listener on the host. Cam --- Makefile.target | 3 + hw/ivshmem.c | 421 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ hw/pc.c | 6 + hw/pc.h | 3 + qemu-options.hx | 14 ++ sysemu.h | 8 + vl.c | 14 ++ 7 files changed, 469 insertions(+), 0 deletions(-) create mode 100644 hw/ivshmem.c diff --git a/Makefile.target b/Makefile.target index b68a689..3190bba 100644 --- a/Makefile.target +++ b/Makefile.target @@ -643,6 +643,9 @@ OBJS += pcnet.o OBJS += rtl8139.o OBJS += e1000.o +# Inter-VM PCI shared memory +OBJS += ivshmem.o + # Generic watchdog support and some watchdog devices OBJS += watchdog.o OBJS += wdt_ib700.o wdt_i6300esb.o diff --git a/hw/ivshmem.c b/hw/ivshmem.c new file mode 100644 index 0000000..95e2268 --- /dev/null +++ b/hw/ivshmem.c @@ -0,0 +1,421 @@ +/* + * Inter-VM Shared Memory PCI device. + * + * Author: + * Cam Macdonell <cam@xxxxxxxxxxxxxx> + * + * Based On: cirrus_vga.c and rtl8139.c + * + * This code is licensed under the GNU GPL v2. + */ + +#include "hw.h" +#include "console.h" +#include "pc.h" +#include "pci.h" +#include "sysemu.h" + +#include "qemu-common.h" +#include <sys/mman.h> + +#define PCI_COMMAND_IOACCESS 0x0001 +#define PCI_COMMAND_MEMACCESS 0x0002 +#define PCI_COMMAND_BUSMASTER 0x0004 + +//#define DEBUG_IVSHMEM + +#ifdef DEBUG_IVSHMEM +#define IVSHMEM_DPRINTF(fmt, args...) \ + do {printf("IVSHMEM: " fmt, ##args); } while (0) +#else +#define IVSHMEM_DPRINTF(fmt, args...) 
+#endif + +typedef struct IVShmemState { + uint16_t intrmask; + uint16_t intrstatus; + uint16_t doorbell; + uint8_t *ivshmem_ptr; + unsigned long ivshmem_offset; + unsigned int ivshmem_size; + unsigned long bios_offset; + unsigned int bios_size; + target_phys_addr_t base_ctrl; + int it_shift; + PCIDevice *pci_dev; + CharDriverState * chr; + unsigned long map_addr; + unsigned long map_end; + int ivshmem_mmio_io_addr; +} IVShmemState; + +typedef struct PCI_IVShmemState { + PCIDevice dev; + IVShmemState ivshmem_state; +} PCI_IVShmemState; + +typedef struct IVShmemDesc { + char name[1024]; + char * chrdev; + int size; +} IVShmemDesc; + + +/* registers for the Inter-VM shared memory device */ +enum ivshmem_registers { + IntrMask = 0, + IntrStatus = 16, + Doorbell = 32 +}; + +static int num_ivshmem_devices = 0; +static IVShmemDesc ivshmem_desc; + +static void ivshmem_map(PCIDevice *pci_dev, int region_num, + uint32_t addr, uint32_t size, int type) +{ + PCI_IVShmemState *d = (PCI_IVShmemState *)pci_dev; + IVShmemState *s = &d->ivshmem_state; + + IVSHMEM_DPRINTF("addr = %u size = %u\n", addr, size); + cpu_register_physical_memory(addr, s->ivshmem_size, s->ivshmem_offset); + +} + +void ivshmem_init(const char * optarg) { + + char * temp; + char * ivshmem_sz; + int size; + + num_ivshmem_devices++; + + /* currently we only support 1 device */ + if (num_ivshmem_devices > MAX_IVSHMEM_DEVICES) { + return; + } + + temp = strdup(optarg); + snprintf(ivshmem_desc.name, 1024, "/%s", strsep(&temp,",")); + ivshmem_sz=strsep(&temp,","); + if (ivshmem_sz != NULL){ + size = atol(ivshmem_sz); + } else { + size = -1; + } + + ivshmem_desc.chrdev = strsep(&temp,"\0"); + + if ( size == -1) { + ivshmem_desc.size = TARGET_PAGE_SIZE; + } else { + ivshmem_desc.size = size*1024*1024; + } + IVSHMEM_DPRINTF("optarg is %s, name is %s, size is %d, chrdev is %s\n", + optarg, ivshmem_desc.name, + ivshmem_desc.size, ivshmem_desc.chrdev); +} + +int ivshmem_get_size(void) { + return ivshmem_desc.size; +} + 
+/* accessing registers - based on rtl8139 */ +static void ivshmem_update_irq(IVShmemState *s) +{ + int isr; + isr = (s->intrstatus & s->intrmask) & 0xffff; + + /* don't print ISR resets */ + if (isr) { + IVSHMEM_DPRINTF("Set IRQ to %d (%04x %04x)\n", + isr ? 1 : 0, s->intrstatus, s->intrmask); + } + + qemu_set_irq(s->pci_dev->irq[0], (isr != 0)); +} + +static void ivshmem_mmio_map(PCIDevice *pci_dev, int region_num, + uint32_t addr, uint32_t size, int type) +{ + PCI_IVShmemState *d = (PCI_IVShmemState *)pci_dev; + IVShmemState *s = &d->ivshmem_state; + + cpu_register_physical_memory(addr + 0, 0x100, s->ivshmem_mmio_io_addr); +} + +static void ivshmem_IntrMask_write(IVShmemState *s, uint32_t val) +{ + IVSHMEM_DPRINTF("IntrMask write(w) val = 0x%04x\n", val); + + s->intrmask = val; + + ivshmem_update_irq(s); +} + +static uint32_t ivshmem_IntrMask_read(IVShmemState *s) +{ + uint32_t ret = s->intrmask; + + IVSHMEM_DPRINTF("intrmask read(w) val = 0x%04x\n", ret); + + return ret; +} + +static void ivshmem_IntrStatus_write(IVShmemState *s, uint32_t val) +{ + IVSHMEM_DPRINTF("IntrStatus write(w) val = 0x%04x\n", val); + + s->intrstatus = val; + + ivshmem_update_irq(s); + return; +} + +static uint32_t ivshmem_IntrStatus_read(IVShmemState *s) +{ + uint32_t ret = s->intrstatus; + + /* reading ISR clears all interrupts */ + s->intrstatus = 0; + + ivshmem_update_irq(s); + + return ret; +} + +static void ivshmem_io_writew(void *opaque, uint8_t addr, uint32_t val) +{ + IVShmemState *s = opaque; + + IVSHMEM_DPRINTF("writing 0x%x to 0x%lx\n", addr, (unsigned long) opaque); + + addr &= 0xfe; + + switch (addr) + { + case IntrMask: + ivshmem_IntrMask_write(s, val); + break; + + case IntrStatus: + ivshmem_IntrStatus_write(s, val); + break; + + default: + IVSHMEM_DPRINTF("why are we writing 0x%x\n", addr); + } +} + +static void ivshmem_io_writel(void *opaque, uint8_t addr, uint32_t val) +{ + IVSHMEM_DPRINTF("We shouldn't be writing longs\n"); +} + +static void ivshmem_io_writeb(void 
*opaque, uint8_t addr, uint32_t val) +{ + IVShmemState *s = opaque; + uint8_t writebyte = val & 0xff; //write the lower 8-bits of 'val' + + switch (addr) + { // in future, we will probably want to support more types of doorbells + case Doorbell: + // wake up the other side + qemu_chr_write(s->chr, &writebyte, 1); + IVSHMEM_DPRINTF("Writing to the other side 0x%x\n", writebyte); + break; + default: + IVSHMEM_DPRINTF("Unhandled write (0x%x)\n", addr); + } +} + +static uint32_t ivshmem_io_readw(void *opaque, uint8_t addr) +{ + + IVShmemState *s = opaque; + uint32_t ret; + + switch (addr) + { + case IntrMask: + ret = ivshmem_IntrMask_read(s); + break; + case IntrStatus: + ret = ivshmem_IntrStatus_read(s); + break; + default: + IVSHMEM_DPRINTF("why are we reading 0x%x\n", addr); + ret = 0; + } + + return ret; +} + +static uint32_t ivshmem_io_readl(void *opaque, uint8_t addr) +{ + IVSHMEM_DPRINTF("We shouldn't be reading longs\n"); + return 0; +} + +static uint32_t ivshmem_io_readb(void *opaque, uint8_t addr) +{ + IVSHMEM_DPRINTF("We shouldn't be reading bytes\n"); + + return 0; +} + +static void ivshmem_mmio_writeb(void *opaque, + target_phys_addr_t addr, uint32_t val) +{ + ivshmem_io_writeb(opaque, addr & 0xFF, val); +} + +static void ivshmem_mmio_writew(void *opaque, + target_phys_addr_t addr, uint32_t val) +{ + ivshmem_io_writew(opaque, addr & 0xFF, val); +} + +static void ivshmem_mmio_writel(void *opaque, + target_phys_addr_t addr, uint32_t val) +{ + ivshmem_io_writel(opaque, addr & 0xFF, val); +} + +static uint32_t ivshmem_mmio_readb(void *opaque, target_phys_addr_t addr) +{ + return ivshmem_io_readb(opaque, addr & 0xFF); +} + +static uint32_t ivshmem_mmio_readw(void *opaque, target_phys_addr_t addr) +{ + uint32_t val = ivshmem_io_readw(opaque, addr & 0xFF); + return val; +} + +static uint32_t ivshmem_mmio_readl(void *opaque, target_phys_addr_t addr) +{ + uint32_t val = ivshmem_io_readl(opaque, addr & 0xFF); + return val; +} + +static CPUReadMemoryFunc 
*ivshmem_mmio_read[3] = { + ivshmem_mmio_readb, + ivshmem_mmio_readw, + ivshmem_mmio_readl, +}; + +static CPUWriteMemoryFunc *ivshmem_mmio_write[3] = { + ivshmem_mmio_writeb, + ivshmem_mmio_writew, + ivshmem_mmio_writel, +}; + +static int ivshmem_can_receive(void * opaque) +{ + return 1; +} + +static void ivshmem_receive(void *opaque, const uint8_t *buf, int size) +{ + IVShmemState *s = opaque; + + ivshmem_IntrStatus_write(s, *buf); + + IVSHMEM_DPRINTF("ivshmem_receive 0x%02x\n", *buf); +} + +static void ivshmem_event(void *opaque, int event) +{ + IVShmemState *s = opaque; + IVSHMEM_DPRINTF("ivshmem_event %d\n", event); +} + +int pci_ivshmem_init(PCIBus *bus) +{ + PCI_IVShmemState *d; + IVShmemState *s; + uint8_t *pci_conf; + int ivshmem_fd; + + IVSHMEM_DPRINTF("shared file is %s\n", ivshmem_desc.name); + d = (PCI_IVShmemState *)pci_register_device(bus, "kvm_ivshmem", + sizeof(PCI_IVShmemState), + -1, NULL, NULL); + if (!d) { + return -1; + } + + s = &d->ivshmem_state; + + /* allocate shared memory RAM */ + s->ivshmem_offset = qemu_ram_alloc(ivshmem_desc.size); + IVSHMEM_DPRINTF("size is = %d\n", ivshmem_desc.size); + IVSHMEM_DPRINTF("ivshmem ram offset = %ld\n", s->ivshmem_offset); + + s->ivshmem_ptr = qemu_get_ram_ptr(s->ivshmem_offset); + + s->pci_dev = &d->dev; + s->ivshmem_size = ivshmem_desc.size; + + pci_conf = d->dev.config; + pci_conf[0x00] = 0xf4; // Qumranet vendor ID 0x5002 + pci_conf[0x01] = 0x1a; + pci_conf[0x02] = 0x10; + pci_conf[0x03] = 0x11; + pci_conf[0x04] = PCI_COMMAND_IOACCESS | PCI_COMMAND_MEMACCESS; + pci_conf[0x0a] = 0x00; // RAM controller + pci_conf[0x0b] = 0x05; + pci_conf[0x0e] = 0x00; // header_type + + pci_conf[PCI_INTERRUPT_PIN] = 1; // we are going to support interrupts + + /* XXX: ivshmem_desc.size must be a power of two */ + + s->ivshmem_mmio_io_addr = cpu_register_io_memory(0, ivshmem_mmio_read, + ivshmem_mmio_write, s); + + /* region for registers*/ + pci_register_io_region(&d->dev, 0, 0x100, + PCI_ADDRESS_SPACE_MEM, 
ivshmem_mmio_map); + + /* region for shared memory */ + pci_register_io_region(&d->dev, 1, ivshmem_desc.size, + PCI_ADDRESS_SPACE_MEM, ivshmem_map); + + /* open shared memory file */ + if ((ivshmem_fd = shm_open(ivshmem_desc.name, O_CREAT|O_RDWR, S_IRWXU)) < 0) + { + fprintf(stderr, "kvm_ivshmem: could not open shared file\n"); + exit(-1); + } + + ftruncate(ivshmem_fd, ivshmem_desc.size); + + /* mmap onto PCI device's memory */ + if (mmap(s->ivshmem_ptr, ivshmem_desc.size, PROT_READ|PROT_WRITE, + MAP_SHARED|MAP_FIXED, ivshmem_fd, 0) == MAP_FAILED) + { + fprintf(stderr, "kvm_ivshmem: could not mmap shared file\n"); + exit(-1); + } + + IVSHMEM_DPRINTF("shared object mapped to 0x%p\n", s->ivshmem_ptr); + + /* setup character device channel */ + + if (ivshmem_desc.chrdev != NULL) { + char label[32]; + snprintf(label, 32, "ivshmem_chardev"); + s->chr = qemu_chr_open(label, ivshmem_desc.chrdev, NULL); + if (s->chr == NULL) { + fprintf(stderr, "No server listening on %s\n", ivshmem_desc.chrdev); + exit(-1); + } + qemu_chr_add_handlers(s->chr, ivshmem_can_receive, ivshmem_receive, + ivshmem_event, s); + } + + return 0; +} + diff --git a/hw/pc.c b/hw/pc.c index 34a4d25..7d0cff2 100644 --- a/hw/pc.c +++ b/hw/pc.c @@ -67,6 +67,8 @@ static PITState *pit; static IOAPICState *ioapic; static PCIDevice *i440fx_state; +extern int ivshmem_enabled; + static void ioport80_write(void *opaque, uint32_t addr, uint32_t data) { } @@ -1040,6 +1042,10 @@ static void pc_init1(ram_addr_t ram_size, int vga_ram_size, } } + if (pci_enabled && ivshmem_enabled) { + pci_ivshmem_init(pci_bus); + } + rtc_state = rtc_init(0x70, i8259[8], 2000); qemu_register_boot_set(pc_boot_set, rtc_state); diff --git a/hw/pc.h b/hw/pc.h index 885c918..0ae0493 100644 --- a/hw/pc.h +++ b/hw/pc.h @@ -185,4 +185,7 @@ void isa_ne2000_init(int base, qemu_irq irq, NICInfo *nd); void extboot_init(BlockDriverState *bs, int cmd); +/* ivshmem.c */ +int pci_ivshmem_init(PCIBus *bus); + #endif diff --git a/qemu-options.hx 
b/qemu-options.hx index 173f458..9ab3e2d 100644 --- a/qemu-options.hx +++ b/qemu-options.hx @@ -1243,6 +1243,20 @@ The default device is @code{vc} in graphical mode and @code{stdio} in non graphical mode. ETEXI +DEF("ivshmem", HAS_ARG, QEMU_OPTION_ivshmem, \ + "-ivshmem name,size[,unix:path][,server] creates or opens a shared file 'name' of size \ + 'size' (in MB) and exposes it as a PCI device in the guest\n") +STEXI +@item -ivshmem @var{file},@var{size} +Creates a POSIX shared file named @var{file} of size @var{size} and creates a +PCI device of the same size that maps the shared file into the device for guests +to access. The created file on the host is located in /dev/shm/ + +@item unix:@var{path}[,server] +A unix domain socket is used to send and receive interrupts between VMs. The unix domain socket +@var{path} is used for connections. +ETEXI + DEF("pidfile", HAS_ARG, QEMU_OPTION_pidfile, \ "-pidfile file write PID to 'file'\n") STEXI diff --git a/sysemu.h b/sysemu.h index 1f45fd6..862b79e 100644 --- a/sysemu.h +++ b/sysemu.h @@ -217,6 +217,14 @@ extern CharDriverState *parallel_hds[MAX_PARALLEL_PORTS]; extern CharDriverState *virtcon_hds[MAX_VIRTIO_CONSOLES]; +/* inter-VM shared memory devices */ + +#define MAX_IVSHMEM_DEVICES 1 + +extern CharDriverState * ivshmem_chardev; +void ivshmem_init(const char * optarg); +int ivshmem_get_size(void); + #define TFR(expr) do { if ((expr) != -1) break; } while (errno == EINTR) #ifdef NEED_CPU_H diff --git a/vl.c b/vl.c index 0420634..7260fa1 100644 --- a/vl.c +++ b/vl.c @@ -221,6 +221,7 @@ static int rtc_date_offset = -1; /* -1 means no change */ int cirrus_vga_enabled = 1; int std_vga_enabled = 0; int vmsvga_enabled = 0; +int ivshmem_enabled = 0; int xenfb_enabled = 0; #ifdef TARGET_SPARC int graphic_width = 1024; @@ -239,6 +240,8 @@ int no_quit = 0; CharDriverState *serial_hds[MAX_SERIAL_PORTS]; CharDriverState *parallel_hds[MAX_PARALLEL_PORTS]; CharDriverState *virtcon_hds[MAX_VIRTIO_CONSOLES]; +CharDriverState 
*ivshmem_chardev; +const char * ivshmem_device; #ifdef TARGET_I386 int win2k_install_hack = 0; int rtc_td_hack = 0; @@ -5063,6 +5066,8 @@ int main(int argc, char **argv, char **envp) cyls = heads = secs = 0; translation = BIOS_ATA_TRANSLATION_AUTO; monitor_device = "vc:80Cx24C"; + ivshmem_device = NULL; + ivshmem_chardev = NULL; serial_devices[0] = "vc:80Cx24C"; for(i = 1; i < MAX_SERIAL_PORTS; i++) @@ -5518,6 +5523,10 @@ int main(int argc, char **argv, char **envp) parallel_devices[parallel_device_index] = optarg; parallel_device_index++; break; + case QEMU_OPTION_ivshmem: + ivshmem_device = optarg; + ivshmem_enabled = 1; + break; case QEMU_OPTION_loadvm: loadvm = optarg; break; @@ -5984,6 +5993,11 @@ int main(int argc, char **argv, char **envp) } } + if (ivshmem_enabled) { + ivshmem_init(ivshmem_device); + ram_size += ivshmem_get_size(); + } + #ifdef CONFIG_KQEMU /* FIXME: This is a nasty hack because kqemu can't cope with dynamic guest ram allocation. It needs to go away. */ -- 1.6.0.6 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html