On 03/03/20 15:19, David Hildenbrand wrote:
> virtio-mem wants to resize (esp. grow) ram memory regions while the guest
> is already aware of them and makes use of them. Resizing a KVM slot can
> only currently be done by removing it and re-adding it. While the kvm slot
> is temporarily removed, VCPUs that try to read from these slots will fault.

Only fetches, I think? Data reads and writes would be treated as MMIO
accesses and they should just work, using either the old or the new
FlatView (see the simplified KVM_EXIT_MMIO excerpt at the end of this
mail).

> But also, other ioctls might depend on all slots being in place.
> 
> Let's inhibit most KVM ioctls while performing the resize. Once we have an
> ioctl that can perform atomic resizes (e.g., KVM_SET_USER_MEMORY_REGION
> extensions), we can make inhibiting optional at runtime.
> 
> Also, make sure to hold the kvm_slots_lock while performing both
> actions (removing+re-adding).
> 
> Note: Resizes of memory regions currently seem to happen during bootup
> only, so I don't think any existing RT users should be affected.

rwlocks are not efficient; they cause cache line contention. For
MMIO-heavy workloads the impact will be very large (well, not that large,
because right now they all take the BQL, but one can always hope). I
would very much prefer to add a KVM_SET_USER_MEMORY_REGION extension
right away; the sketches after the patch illustrate both the contention
problem and the kind of interface I have in mind.

Paolo

> Cc: Richard Henderson <rth@xxxxxxxxxxx>
> Cc: Paolo Bonzini <pbonzini@xxxxxxxxxx>
> Cc: "Dr. David Alan Gilbert" <dgilbert@xxxxxxxxxx>
> Cc: Eduardo Habkost <ehabkost@xxxxxxxxxx>
> Cc: Marcel Apfelbaum <marcel.apfelbaum@xxxxxxxxx>
> Cc: Igor Mammedov <imammedo@xxxxxxxxxx>
> Cc: kvm@xxxxxxxxxxxxxxx
> Signed-off-by: David Hildenbrand <david@xxxxxxxxxx>
> ---
>  accel/kvm/kvm-all.c   | 121 +++++++++++++++++++++++++++++++++++++++---
>  include/hw/core/cpu.h |   3 ++
>  2 files changed, 117 insertions(+), 7 deletions(-)
> 
> diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
> index 439a4efe52..bba58db098 100644
> --- a/accel/kvm/kvm-all.c
> +++ b/accel/kvm/kvm-all.c
> @@ -149,6 +149,21 @@ bool kvm_msi_use_devid;
>  static bool kvm_immediate_exit;
>  static hwaddr kvm_max_slot_size = ~0;
>  
> +/*
> + * While holding this lock in write, no new KVM ioctls can be started, but
> + * kvm ioctl inhibitors will have to wait for existing ones to finish
> + * (indicated by cpu->in_ioctl and kvm_in_ioctl, both updated with this lock
> + * held in read when entering the ioctl).
> + */
> +pthread_rwlock_t kvm_ioctl_lock;
> +/*
> + * Atomic counter of active KVM ioctls except
> + * - The KVM ioctl inhibitor is doing an ioctl
> + * - kvm_ioctl(): Harmless and not interesting for inhibitors.
> + * - kvm_vcpu_ioctl(): Tracked via cpu->in_ioctl.
> + */
> +static int kvm_in_ioctl;
> +
>  static const KVMCapabilityInfo kvm_required_capabilites[] = {
>      KVM_CAP_INFO(USER_MEMORY),
>      KVM_CAP_INFO(DESTROY_MEMORY_REGION_WORKS),
> @@ -1023,6 +1038,7 @@ void kvm_set_max_memslot_size(hwaddr max_slot_size)
>      kvm_max_slot_size = max_slot_size;
>  }
>  
> +/* Called with KVMMemoryListener.slots_lock held */
>  static void kvm_set_phys_mem(KVMMemoryListener *kml,
>                               MemoryRegionSection *section, bool add)
>  {
> @@ -1052,14 +1068,12 @@ static void kvm_set_phys_mem(KVMMemoryListener *kml,
>      ram = memory_region_get_ram_ptr(mr) + section->offset_within_region +
>            (start_addr - section->offset_within_address_space);
>  
> -    kvm_slots_lock(kml);
> -
>      if (!add) {
>          do {
>              slot_size = MIN(kvm_max_slot_size, size);
>              mem = kvm_lookup_matching_slot(kml, start_addr, slot_size);
>              if (!mem) {
> -                goto out;
> +                return;
>              }
>              if (mem->flags & KVM_MEM_LOG_DIRTY_PAGES) {
>                  kvm_physical_sync_dirty_bitmap(kml, section);
> @@ -1079,7 +1093,7 @@ static void kvm_set_phys_mem(KVMMemoryListener *kml,
>              start_addr += slot_size;
>              size -= slot_size;
>          } while (size);
> -        goto out;
> +        return;
>      }
>  
>      /* register the new slot */
> @@ -1108,9 +1122,6 @@ static void kvm_set_phys_mem(KVMMemoryListener *kml,
>          ram += slot_size;
>          size -= slot_size;
>      } while (size);
> -
> -out:
> -    kvm_slots_unlock(kml);
>  }
>  
>  static void kvm_region_add(MemoryListener *listener,
> @@ -1119,7 +1130,9 @@ static void kvm_region_add(MemoryListener *listener,
>      KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
>  
>      memory_region_ref(section->mr);
> +    kvm_slots_lock(kml);
>      kvm_set_phys_mem(kml, section, true);
> +    kvm_slots_unlock(kml);
>  }
>  
>  static void kvm_region_del(MemoryListener *listener,
> @@ -1127,10 +1140,68 @@ static void kvm_region_del(MemoryListener *listener,
>  {
>      KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
>  
> +    kvm_slots_lock(kml);
>      kvm_set_phys_mem(kml, section, false);
> +    kvm_slots_unlock(kml);
>      memory_region_unref(section->mr);
>  }
>  
> +/*
> + * Certain updates (e.g., resizing memory regions) require temporarily removing
> + * kvm memory slots. Make sure any ioctl sees a consistent memory slot state.
> + */
> +static void kvm_ioctl_inhibit_begin(void)
> +{
> +    CPUState *cpu;
> +
> +    /*
> +     * We allow to inhibit only when holding the BQL, so we can identify
> +     * when an inhibitor wants to issue an ioctl easily.
> +     */
> +    g_assert(qemu_mutex_iothread_locked());
> +
> +    pthread_rwlock_wrlock(&kvm_ioctl_lock);
> +
> +    /* Inhibiting happens rarely, we can keep things simple and spin here. */
> +    while (true) {
> +        bool any_cpu_in_ioctl = false;
> +
> +        CPU_FOREACH(cpu) {
> +            if (atomic_read(&cpu->in_ioctl)) {
> +                any_cpu_in_ioctl = true;
> +                qemu_cpu_kick(cpu);
> +            }
> +        }
> +        if (!any_cpu_in_ioctl && !atomic_read(&kvm_in_ioctl)) {
> +            break;
> +        }
> +        g_usleep(100);
> +    }
> +}
> +
> +static void kvm_ioctl_inhibit_end(void)
> +{
> +    pthread_rwlock_unlock(&kvm_ioctl_lock);
> +}
> +
> +static void kvm_region_resize(MemoryListener *listener,
> +                              MemoryRegionSection *section, Int128 new)
> +{
> +    KVMMemoryListener *kml = container_of(listener, KVMMemoryListener,
> +                                          listener);
> +    MemoryRegionSection new_section = *section;
> +
> +    new_section.size = new;
> +
> +    kvm_slots_lock(kml);
> +    /* Inhibit KVM ioctls while temporarily removing slots. */
> +    kvm_ioctl_inhibit_begin();
> +    kvm_set_phys_mem(kml, section, false);
> +    kvm_set_phys_mem(kml, &new_section, true);
> +    kvm_ioctl_inhibit_end();
> +    kvm_slots_unlock(kml);
> +}
> +
>  static void kvm_log_sync(MemoryListener *listener,
>                           MemoryRegionSection *section)
>  {
> @@ -1249,6 +1320,7 @@ void kvm_memory_listener_register(KVMState *s, KVMMemoryListener *kml,
>  
>      kml->listener.region_add = kvm_region_add;
>      kml->listener.region_del = kvm_region_del;
> +    kml->listener.region_resize = kvm_region_resize;
>      kml->listener.log_start = kvm_log_start;
>      kml->listener.log_stop = kvm_log_stop;
>      kml->listener.log_sync = kvm_log_sync;
> @@ -1894,6 +1966,7 @@ static int kvm_init(MachineState *ms)
>      assert(TARGET_PAGE_SIZE <= qemu_real_host_page_size);
>  
>      s->sigmask_len = 8;
> +    pthread_rwlock_init(&kvm_ioctl_lock, NULL);
>  
>  #ifdef KVM_CAP_SET_GUEST_DEBUG
>      QTAILQ_INIT(&s->kvm_sw_breakpoints);
> @@ -2304,6 +2377,34 @@ static void kvm_eat_signals(CPUState *cpu)
>      } while (sigismember(&chkset, SIG_IPI));
>  }
>  
> +static void kvm_cpu_set_in_ioctl(CPUState *cpu, bool in_ioctl)
> +{
> +    if (unlikely(qemu_mutex_iothread_locked())) {
> +        return;
> +    }
> +    if (in_ioctl) {
> +        pthread_rwlock_rdlock(&kvm_ioctl_lock);
> +        atomic_set(&cpu->in_ioctl, true);
> +        pthread_rwlock_unlock(&kvm_ioctl_lock);
> +    } else {
> +        atomic_set(&cpu->in_ioctl, false);
> +    }
> +}
> +
> +static void kvm_set_in_ioctl(bool in_ioctl)
> +{
> +    if (likely(qemu_mutex_iothread_locked())) {
> +        return;
> +    }
> +    if (in_ioctl) {
> +        pthread_rwlock_rdlock(&kvm_ioctl_lock);
> +        atomic_inc(&kvm_in_ioctl);
> +        pthread_rwlock_unlock(&kvm_ioctl_lock);
> +    } else {
> +        atomic_dec(&kvm_in_ioctl);
> +    }
> +}
> +
>  int kvm_cpu_exec(CPUState *cpu)
>  {
>      struct kvm_run *run = cpu->kvm_run;
> @@ -2488,7 +2589,9 @@ int kvm_vm_ioctl(KVMState *s, int type, ...)
>      va_end(ap);
>  
>      trace_kvm_vm_ioctl(type, arg);
> +    kvm_set_in_ioctl(true);
>      ret = ioctl(s->vmfd, type, arg);
> +    kvm_set_in_ioctl(false);
>      if (ret == -1) {
>          ret = -errno;
>      }
> @@ -2506,7 +2609,9 @@ int kvm_vcpu_ioctl(CPUState *cpu, int type, ...)
>      va_end(ap);
>  
>      trace_kvm_vcpu_ioctl(cpu->cpu_index, type, arg);
> +    kvm_cpu_set_in_ioctl(cpu, true);
>      ret = ioctl(cpu->kvm_fd, type, arg);
> +    kvm_cpu_set_in_ioctl(cpu, false);
>      if (ret == -1) {
>          ret = -errno;
>      }
> @@ -2524,7 +2629,9 @@ int kvm_device_ioctl(int fd, int type, ...)
>      va_end(ap);
>  
>      trace_kvm_device_ioctl(fd, type, arg);
> +    kvm_set_in_ioctl(true);
>      ret = ioctl(fd, type, arg);
> +    kvm_set_in_ioctl(false);
>      if (ret == -1) {
>          ret = -errno;
>      }
> diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h
> index 73e9a869a4..4fbff6f3d7 100644
> --- a/include/hw/core/cpu.h
> +++ b/include/hw/core/cpu.h
> @@ -431,6 +431,9 @@ struct CPUState {
>      /* shared by kvm, hax and hvf */
>      bool vcpu_dirty;
>  
> +    /* kvm only for now: CPU is in kvm_vcpu_ioctl() (esp. KVM_RUN) */
> +    bool in_ioctl;
> +
>      /* Used to keep track of an outstanding cpu throttle thread for migration
>       * autoconverge
>       */
> 
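
To make the "only fetches" point above concrete, this is roughly what
kvm_cpu_exec() already does today when a data access misses all memslots
(simplified from accel/kvm/kvm-all.c; in the full code, attrs comes from
kvm_arch_post_run()). The access is completed through the memory API
against whichever FlatView is current, so a brief window without the slot
is harmless; an instruction fetch has no such userspace fallback:

        case KVM_EXIT_MMIO:
            /*
             * A load/store that hit no memslot exits to userspace and is
             * completed through the memory API -- against the old or the
             * new FlatView, it just works.
             */
            address_space_rw(&address_space_memory,
                             run->mmio.phys_addr, attrs,
                             run->mmio.data,
                             run->mmio.len,
                             run->mmio.is_write);
            ret = 0;
            break;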
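
For reference, the inhibit scheme in the patch boils down to the following
quiesce pattern (standalone pthreads sketch; the names are invented, and
QEMU specifics such as kicking vCPUs out of KVM_RUN are omitted). Note
that every op_begin() touches the rwlock's cache line, on every ioctl,
from every vCPU thread -- that is the contention I am worried about:

#include <pthread.h>
#include <stdatomic.h>
#include <unistd.h>

static pthread_rwlock_t inhibit_lock = PTHREAD_RWLOCK_INITIALIZER;
static atomic_int in_flight;

static void op_begin(void)          /* cf. kvm_set_in_ioctl(true) */
{
    /* Blocks while an inhibitor holds the lock for write. */
    pthread_rwlock_rdlock(&inhibit_lock);
    atomic_fetch_add(&in_flight, 1);
    pthread_rwlock_unlock(&inhibit_lock);
}

static void op_end(void)            /* cf. kvm_set_in_ioctl(false) */
{
    atomic_fetch_sub(&in_flight, 1);
}

static void inhibit_begin(void)     /* cf. kvm_ioctl_inhibit_begin() */
{
    pthread_rwlock_wrlock(&inhibit_lock);
    /* New ops now block in op_begin(); wait for in-flight ones to drain. */
    while (atomic_load(&in_flight)) {
        usleep(100);
    }
}

static void inhibit_end(void)       /* cf. kvm_ioctl_inhibit_end() */
{
    pthread_rwlock_unlock(&inhibit_lock);
}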
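
And to illustrate what a KVM_SET_USER_MEMORY_REGION extension could look
like from userspace -- purely hypothetical, the KVM_MEM_ALLOW_RESIZE flag
and its semantics are invented here and do not exist in KVM today:

#include <linux/kvm.h>
#include <sys/ioctl.h>

#define KVM_MEM_ALLOW_RESIZE (1UL << 2)   /* invented, not a real KVM flag */

static int kvm_resize_slot(int vm_fd, __u32 slot, __u64 gpa,
                           __u64 new_size, void *hva)
{
    struct kvm_userspace_memory_region mem = {
        .slot = slot,
        .flags = KVM_MEM_ALLOW_RESIZE,    /* invented */
        .guest_phys_addr = gpa,
        .memory_size = new_size,          /* grow/shrink the slot in place */
        .userspace_addr = (__u64)(unsigned long)hva,
    };

    /*
     * With such an extension, KVM would swap the slot size atomically:
     * vCPUs would never observe a window without the slot, so all of the
     * ioctl inhibiting above would become unnecessary.
     */
    return ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &mem);
}

With something like that, the rwlock, cpu->in_ioctl and kvm_in_ioctl could
all go away, and kvm_region_resize() would be a single ioctl issued under
kvm_slots_lock.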