Re: [PATCH] kvm: First step to push iothread lock out of inner run loop

On Sat, Jun 23, 2012 at 12:55:49AM +0200, Jan Kiszka wrote:
> Should have declared this [RFC] in the subject and CC'ed kvm...
> 
> On 2012-06-23 00:45, Jan Kiszka wrote:
> > This sketches a possible path to get rid of the iothread lock on vmexits
> > in KVM mode. On x86, the in-kernel irqchip has to be used because
> > we otherwise need to synchronize APIC and other per-cpu state accesses
> > that could be changed concurrently. Not yet fully analyzed is the NMI
> > injection path in the absence of an APIC.
> > 
> > s390x should be fine without specific locking as its pre/post-run
> > callbacks are empty. Power requires locking for the pre-run callback.
> > 
> > This patch is untested, but a similar version was successfully used in
> > an x86 setup with a network I/O path that needed no central iothread
> > locking anymore (required special MMIO exit handling).
> > ---
> >  kvm-all.c         |   18 ++++++++++++++++--
> >  target-i386/kvm.c |    7 +++++++
> >  target-ppc/kvm.c  |    4 ++++
> >  3 files changed, 27 insertions(+), 2 deletions(-)
> > 
> > diff --git a/kvm-all.c b/kvm-all.c
> > index f8e4328..9c3e26f 100644
> > --- a/kvm-all.c
> > +++ b/kvm-all.c
> > @@ -1460,6 +1460,8 @@ int kvm_cpu_exec(CPUArchState *env)
> >          return EXCP_HLT;
> >      }
> >  
> > +    qemu_mutex_unlock_iothread();
> > +
> >      do {
> >          if (env->kvm_vcpu_dirty) {
> >              kvm_arch_put_registers(env, KVM_PUT_RUNTIME_STATE);
> > @@ -1476,14 +1478,16 @@ int kvm_cpu_exec(CPUArchState *env)
> >               */
> >              qemu_cpu_kick_self();
> >          }
> > -        qemu_mutex_unlock_iothread();
> >  
> >          run_ret = kvm_vcpu_ioctl(env, KVM_RUN, 0);
> >  
> > -        qemu_mutex_lock_iothread();
> >          kvm_arch_post_run(env, run);
> >  
> > +        /* TODO: push coalesced mmio flushing to the point where we access
> > +         * devices that are using it (currently VGA and E1000). */
> > +        qemu_mutex_lock_iothread();
> >          kvm_flush_coalesced_mmio_buffer();
> > +        qemu_mutex_unlock_iothread();
> >  
> >          if (run_ret < 0) {
> >              if (run_ret == -EINTR || run_ret == -EAGAIN) {
> > @@ -1499,19 +1503,23 @@ int kvm_cpu_exec(CPUArchState *env)
> >          switch (run->exit_reason) {
> >          case KVM_EXIT_IO:
> >              DPRINTF("handle_io\n");
> > +            qemu_mutex_lock_iothread();
> >              kvm_handle_io(run->io.port,
> >                            (uint8_t *)run + run->io.data_offset,
> >                            run->io.direction,
> >                            run->io.size,
> >                            run->io.count);
> > +            qemu_mutex_unlock_iothread();
> >              ret = 0;
> >              break;
> >          case KVM_EXIT_MMIO:
> >              DPRINTF("handle_mmio\n");
> > +            qemu_mutex_lock_iothread();
> >              cpu_physical_memory_rw(run->mmio.phys_addr,
> >                                     run->mmio.data,
> >                                     run->mmio.len,
> >                                     run->mmio.is_write);
> > +            qemu_mutex_unlock_iothread();
> >              ret = 0;
> >              break;
> >          case KVM_EXIT_IRQ_WINDOW_OPEN:
> > @@ -1520,7 +1528,9 @@ int kvm_cpu_exec(CPUArchState *env)
> >              break;
> >          case KVM_EXIT_SHUTDOWN:
> >              DPRINTF("shutdown\n");
> > +            qemu_mutex_lock_iothread();
> >              qemu_system_reset_request();
> > +            qemu_mutex_unlock_iothread();
> >              ret = EXCP_INTERRUPT;
> >              break;
> >          case KVM_EXIT_UNKNOWN:
> > @@ -1533,11 +1543,15 @@ int kvm_cpu_exec(CPUArchState *env)
> >              break;
> >          default:
> >              DPRINTF("kvm_arch_handle_exit\n");
> > +            qemu_mutex_lock_iothread();
> >              ret = kvm_arch_handle_exit(env, run);
> > +            qemu_mutex_unlock_iothread();
> >              break;
> >          }
> >      } while (ret == 0);
> >  
> > +    qemu_mutex_lock_iothread();
> > +
> >      if (ret < 0) {
> >          cpu_dump_state(env, stderr, fprintf, CPU_DUMP_CODE);
> >          vm_stop(RUN_STATE_INTERNAL_ERROR);
> > diff --git a/target-i386/kvm.c b/target-i386/kvm.c
> > index 0d0d8f6..0ad64d1 100644
> > --- a/target-i386/kvm.c
> > +++ b/target-i386/kvm.c
> > @@ -1631,7 +1631,10 @@ void kvm_arch_pre_run(CPUX86State *env, struct kvm_run *run)
> >  
> >      /* Inject NMI */
> >      if (env->interrupt_request & CPU_INTERRUPT_NMI) {
> > +        qemu_mutex_lock_iothread();
> >          env->interrupt_request &= ~CPU_INTERRUPT_NMI;
> > +        qemu_mutex_unlock_iothread();
> > +
> >          DPRINTF("injected NMI\n");
> >          ret = kvm_vcpu_ioctl(env, KVM_NMI);
> >          if (ret < 0) {
> > @@ -1641,6 +1644,8 @@ void kvm_arch_pre_run(CPUX86State *env, struct kvm_run *run)
> >      }
> >  
> >      if (!kvm_irqchip_in_kernel()) {
> > +        qemu_mutex_lock_iothread();
> > +
> >          /* Force the VCPU out of its inner loop to process any INIT requests
> >           * or pending TPR access reports. */
> >          if (env->interrupt_request &
> > @@ -1682,6 +1687,8 @@ void kvm_arch_pre_run(CPUX86State *env, struct kvm_run *run)
> >  
> >          DPRINTF("setting tpr\n");
> >          run->cr8 = cpu_get_apic_tpr(env->apic_state);
> > +
> > +        qemu_mutex_unlock_iothread();
> >      }
> >  }
> >  
> > diff --git a/target-ppc/kvm.c b/target-ppc/kvm.c
> > index c09cc39..60d91a5 100644
> > --- a/target-ppc/kvm.c
> > +++ b/target-ppc/kvm.c
> > @@ -471,6 +471,8 @@ void kvm_arch_pre_run(CPUPPCState *env, struct kvm_run *run)
> >      int r;
> >      unsigned irq;
> >  
> > +    qemu_mutex_lock_iothread();
> > +
> >      /* PowerPC QEMU tracks the various core input pins (interrupt, critical
> >       * interrupt, reset, etc) in PPC-specific env->irq_input_state. */
> >      if (!cap_interrupt_level &&
> > @@ -497,6 +499,8 @@ void kvm_arch_pre_run(CPUPPCState *env, struct kvm_run *run)
> >      /* We don't know if there are more interrupts pending after this. However,
> >       * the guest will return to userspace in the course of handling this one
> >       * anyways, so we will get a chance to deliver the rest. */
> > +
> > +    qemu_mutex_unlock_iothread();
> >  }
> >  
> >  void kvm_arch_post_run(CPUPPCState *env, struct kvm_run *run)
> > 

The following plan would allow progressive conversion to parallel
operation.

Jan mentioned the MMIO handler->MMIO handler deadlock in a private message.

Jan: if there are recursive MMIO accesses, can you detect that and skip
such MMIO handlers in dev_can_use_lock()? Or blacklist them.

What I mentioned earlier about "unsolved issues" refers to the
possibilities in the "iothread flow" section (the belief is that the
best choice would be better determined with execution experience /
tuning).

OK, so, each plan below starts with an xyz.txt header. This is
compatible with TCG; the critical part for decent performance is not
dropping/reacquiring locks all the time (that is, only drop memmap_lock
from the TCG VCPU thread if the IOTHREAD requests so; a rough sketch
follows).
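
A minimal sketch of that TCG rule, assuming memmap_lock is a rwlock and
the request flag is polled by the VCPU thread (all of these names are
assumptions of this plan, not existing code):

#include <pthread.h>
#include <stdbool.h>

static pthread_rwlock_t memmap_lock = PTHREAD_RWLOCK_INITIALIZER;
static volatile bool iothread_wants_memmap_lock;

static void tcg_vcpu_loop(void)
{
    pthread_rwlock_rdlock(&memmap_lock);
    for (;;) {
        /* run TBs and dispatch MMIO while holding memmap_lock ... */
        if (iothread_wants_memmap_lock) {
            /* drop only on request, re-take once the IOTHREAD
             * (a writer) has had its turn */
            pthread_rwlock_unlock(&memmap_lock);
            pthread_rwlock_rdlock(&memmap_lock);
        }
    }
}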

This came out of discussions with Avi.

immediate-plan.txt
------------------

1.  read_lock(memmap_lock)
2.  MemoryRegionSection mrs = lookup(addr)
3.  qom_ref(mrs.mr->dev)
4.  read_unlock(memmap_lock)

5.  mutex_lock(dev->lock)
6.  dispatch(&mrs, addr, data, size)
7.  mutex_unlock(dev->lock)

8.  qom_unref(mrs.mr->dev)
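
In C, the eight steps might look like this (memmap_lock, lookup(),
qom_ref()/qom_unref(), the mr->dev back-pointer and dev->lock are all
assumptions of this plan, not current QEMU API):

#include <pthread.h>
#include <stdint.h>

static void dispatch_mmio(target_phys_addr_t addr, uint8_t *data, int size)
{
    MemoryRegionSection mrs;

    pthread_rwlock_rdlock(&memmap_lock);        /* 1 */
    mrs = lookup(addr);                         /* 2 */
    qom_ref(mrs.mr->dev);                       /* 3: pin device object */
    pthread_rwlock_unlock(&memmap_lock);        /* 4 */

    pthread_mutex_lock(&mrs.mr->dev->lock);     /* 5 */
    dispatch(&mrs, addr, data, size);           /* 6 */
    pthread_mutex_unlock(&mrs.mr->dev->lock);   /* 7 */

    qom_unref(mrs.mr->dev);                     /* 8: object may now vanish */
}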


A) Add Device object to MemoryRegions.

memory_region_add_subregion
memory_region_add_eventfd
memory_region_add_subregion_overlap


B) qom_ref details

Z) see device-hotplug.txt items



lock-model.txt
--------------


lock_device(dev) {
    if (dev_can_use_lock(dev))
        mutex_lock(dev->lock)
    else
        mutex_lock(qemu_global_mutex)
}

dev_can_use_lock(dev)
{
    if (all services used by dev's mmio handler either
        - take qemu_global_mutex (only in dev->lock -> qemu_global_mutex
          order), or
        - trylock(dev->lock) and handle failure (never block on it)
        then yes
    else
        then no
}


That is, from the moment a device uses the order
(dev->lock -> qemu_global_mutex), the reverse order
qemu_global_mutex -> dev->lock is forbidden for the IOTHREAD and
anyone else.
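
In C, with the decision evaluated once when the device is realized so
that lock and unlock always agree (DeviceState::lock and the
can_use_lock flag are assumptions of this plan):

#include <pthread.h>
#include <stdbool.h>

extern pthread_mutex_t qemu_global_mutex;

/* can_use_lock must be decided once (at realize time) and never
 * change, so lock_device() and unlock_device() pick the same mutex. */
static void lock_device(DeviceState *dev)
{
    if (dev->can_use_lock) {
        pthread_mutex_lock(&dev->lock);
    } else {
        pthread_mutex_lock(&qemu_global_mutex);
    }
}

static void unlock_device(DeviceState *dev)
{
    if (dev->can_use_lock) {
        pthread_mutex_unlock(&dev->lock);
    } else {
        pthread_mutex_unlock(&qemu_global_mutex);
    }
}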


convert-to-unlocked.txt
-----------------------


1.  read_lock(memmap_lock)
2.  MemoryRegionSection mrs = lookup(addr)
3.  qom_ref(mrs.mr->dev)
4.  read_unlock(memmap_lock)

5.  mutex_lock(dev->lock)
6.  dispatch(&mrs, addr, data, size)
7.  mutex_unlock(dev->lock)

8.  qom_unref(mrs.mr->dev)

THE ITEMS BELOW SHOULD BE DOCUMENTATION ON CONVERTING A DEVICE
TO UNLOCKED OPERATION.

1) qom_ref guarantees that the Device object does not disappear. However,
these operations are allowed in parallel:

    a) after 4. device memory and ioports might change or be unregistered.

       Concurrent accesses to a device that is being hot-unplugged or
       whose mappings change are the responsibility of the OS, so
       incorrect emulation is not QEMU's problem. However, device
       emulation is now subject to:
            - cpu_physical_memory_map failing.
            - stl_phys* failing.
            - cpu_physical_memory_rw failing/reading from unassigned mem.

    b) what should happen with a pending cpu_physical_memory_map pointer,
       when deleting the corresponding memory region?
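
For example, a converted device's DMA helper has to check the map
result instead of assuming success (dev_dma_read() is a hypothetical
helper; cpu_physical_memory_map()/unmap() are the existing APIs):

#include <string.h>

static int dev_dma_read(target_phys_addr_t addr, void *buf, size_t size)
{
    target_phys_addr_t len = size;
    void *p = cpu_physical_memory_map(addr, &len, 0 /* !is_write */);

    if (!p || len < size) {
        if (p) {
            cpu_physical_memory_unmap(p, len, 0, 0);
        }
        return -1;   /* region unmapped or truncated under us */
    }
    memcpy(buf, p, size);
    cpu_physical_memory_unmap(p, len, 0, len);
    return 0;
}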


net.txt
--------


iothread flow
=============

1) Skip-work-if-device-locked

select(tap fd ready)
tap_send
    if (trylock(TAPState->NetClientState->dev))
        proceed;
    else
        continue; (data remains in queue)
    tap_read_packet
    qemu_send_packet_async

DRAWBACK: lock intensive.
DRAWBACK: VCPU threads can starve the IOTHREAD (can be fixed with a
scheme such as a failed trylock() setting a flag saying "iothread
wants lock").
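
A sketch of flow 1 in C (the nc.dev back-pointer, dev->lock and the
iothread_wants_lock flag are assumptions of this plan; tap_read_packet()
and qemu_send_packet_async() are the existing tap helpers):

#include <pthread.h>

static void tap_send(void *opaque)
{
    TAPState *s = opaque;
    DeviceState *dev = s->nc.dev;   /* assumed back-pointer to device */
    uint8_t buf[68 * 1024];
    int size;

    if (pthread_mutex_trylock(&dev->lock) != 0) {
        dev->iothread_wants_lock = true;   /* starvation fix from above */
        return;                            /* data remains in queue */
    }
    size = tap_read_packet(s->fd, buf, sizeof(buf));
    if (size > 0) {
        qemu_send_packet_async(&s->nc, buf, size, tap_send_completed);
    }
    pthread_mutex_unlock(&dev->lock);
}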

Checking each subsystem for possibility:

- tap_send: OK
    - user,slirp: if not possible, use device->lock.
- qemu-char: OK
- interrupts: lock with qemu_global_mutex.
- qemu-timer:
    - read time: qemu_get_clock_ns (see later).


2) Queue-work-to-vcpu-context

select(tap fd ready)
tap_send
    if (trylock(TAPState->NetClientState->dev)) {
        proceed;
    } else {
        AddToDeviceWorkQueue(work);
    }
    tap_read_packet
    qemu_send_packet_async

DRAWBACK: lock intensive
DRAWBACK: cache effects
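
AddToDeviceWorkQueue() might look like this (DeviceWork, the per-device
queue and its leaf lock are assumptions; QTAILQ and g_malloc0 are
existing QEMU facilities). The lock holder drains the queue before
releasing dev->lock, which is where the cache-effects drawback comes
from: the packet is read on one thread and processed on another.

#include <pthread.h>
#include "qemu-queue.h"

typedef struct DeviceWork {
    void (*fn)(void *opaque);
    void *opaque;
    QTAILQ_ENTRY(DeviceWork) next;
} DeviceWork;

/* dev->work_queue is an assumed QTAILQ_HEAD(, DeviceWork) field,
 * protected by the small leaf lock dev->work_lock. */
static void AddToDeviceWorkQueue(DeviceState *dev, void (*fn)(void *),
                                 void *opaque)
{
    DeviceWork *w = g_malloc0(sizeof(*w));

    w->fn = fn;
    w->opaque = opaque;
    pthread_mutex_lock(&dev->work_lock);
    QTAILQ_INSERT_TAIL(&dev->work_queue, w, next);
    pthread_mutex_unlock(&dev->work_lock);
}

/* Called by the VCPU thread instead of a plain unlock of dev->lock. */
static void device_unlock_and_run_work(DeviceState *dev)
{
    DeviceWork *w;

    pthread_mutex_lock(&dev->work_lock);
    while ((w = QTAILQ_FIRST(&dev->work_queue))) {
        QTAILQ_REMOVE(&dev->work_queue, w, next);
        pthread_mutex_unlock(&dev->work_lock);
        w->fn(w->opaque);          /* runs while still under dev->lock */
        g_free(w);
        pthread_mutex_lock(&dev->work_lock);
    }
    pthread_mutex_unlock(&dev->work_lock);
    pthread_mutex_unlock(&dev->lock);
}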

3) VCPU-thread-notify-device-unlocked

select(tap fd ready)
tap_send
    if (trylock(TAPState->NetClientState->dev)) {
        proceed;
    } else {
        signal VCPU thread to notify IOTHREAD when unlocked.
    }
    tap_read_packet

    
GENERAL OBJECTIVE: to avoid lock inversion. The IOTHREAD should follow
the same lock order as the VCPU threads.


PCI HOTPLUG EJECT FLOW
----------------------

1) Monitor initiated.

monitor -> qmp_device_del -> qdev_unplug -> pci_unplug_device ->
piix4_device_hotplug -> SCI interrupt -> OS removes
application users from device -> OS runs _EJ0 -> 

For hot removal, the device must be immediately ejected when OSPM calls
the _EJ0 control method. The _EJ0 control method does not return until
ejection is complete. After calling _EJ0, OSPM verifies the device no
longer exists to determine if the eject succeeded. For _HID devices,
OSPM evaluates the _STA method. For _ADR devices, OSPM checks with the
bus driver for that device.


Avi: A PCI eject guarantees that all accesses
started after the eject completes do not see the device.

Can you do:

- qobject_remove()

and have device dispatch still running safely? No; therefore
pci_unplug_device must hold dev->lock.
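
That is, something like this (unregister_regions() is a hypothetical
stand-in for the region/eventfd teardown):

static int pci_unplug_device(DeviceState *qdev)
{
    /* Taking dev->lock serializes against any dispatch in flight
     * (steps 5-7 above); after unlock, new lookups no longer find
     * the regions, and qom_unref() frees the object once the last
     * dispatcher drops the reference it took in step 3. */
    pthread_mutex_lock(&qdev->lock);
    unregister_regions(qdev);   /* hypothetical teardown helper */
    pthread_mutex_unlock(&qdev->lock);

    qom_unref(qdev);            /* drop the creation reference */
    return 0;
}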

