On 28/04/14 18:39, Paolo Bonzini wrote: > From: Christian Borntraeger <borntraeger@xxxxxxxxxx> Given all your work, What about From: Paolo Bonzini <pbonzini@xxxxxxxxxx> plus "Based on an inital patch from Christian Borntraeger" > > When starting lots of dataplane devices the bootup takes very long on > Christian's s390 with irqfd patches. With larger setups he is even > able to trigger some timeouts in some components. Turns out that the > KVM_SET_GSI_ROUTING ioctl takes very long (strace claims up to 0.1 sec) > when having multiple CPUs. This is caused by the synchronize_rcu and > the HZ=100 of s390. By changing the code to use a private srcu we can > speed things up. This patch reduces the boot time till mounting root > from 8 to 2 seconds on my s390 guest with 100 disks. > > Uses of hlist_for_each_entry_rcu, hlist_add_head_rcu, hlist_del_init_rcu > are fine because they do not have lockdep checks (hlist_for_each_entry_rcu > uses rcu_dereference_raw rather than rcu_dereference, and write-sides > do not do rcu lockdep at all). > > Note that we're hardly relying on the "sleepable" part of srcu. We just > want SRCU's faster detection of grace periods. > > Testing was done by Andrew Theurer using NETPERF. The difference between > results "before" and "after" the patch has mean -0.2% and standard deviation > 0.6%. Using a paired t-test on the data points says that there is a 2.5% > probability that the patch is the cause of the performance difference > (rather than a random fluctuation). > > Cc: Marcelo Tosatti <mtosatti@xxxxxxxxxx> > Cc: Michael S. Tsirkin <mst@xxxxxxxxxx> > Signed-off-by: Christian Borntraeger <borntraeger@xxxxxxxxxx> > Signed-off-by: Paolo Bonzini <pbonzini@xxxxxxxxxx> Some questions regarding expedided vs. non expedited and a comment without a necessary action. Otherwise Reviewed-by: Christian Borntraeger <borntraeger@xxxxxxxxxx> Tested-by: Christian Borntraeger <borntraeger@xxxxxxxxxx> # on s390 > --- > include/linux/kvm_host.h | 1 + > virt/kvm/eventfd.c | 25 +++++++++++++++---------- > virt/kvm/irq_comm.c | 17 +++++++++-------- > virt/kvm/irqchip.c | 31 ++++++++++++++++--------------- > virt/kvm/kvm_main.c | 16 ++++++++++------ > 5 files changed, 51 insertions(+), 39 deletions(-) > > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h > index 820fc2e1d9df..cd0df9a9352d 100644 > --- a/include/linux/kvm_host.h > +++ b/include/linux/kvm_host.h > @@ -368,6 +368,7 @@ struct kvm { > struct mm_struct *mm; /* userspace tied to this vm */ > struct kvm_memslots *memslots; > struct srcu_struct srcu; > + struct srcu_struct irq_srcu; > #ifdef CONFIG_KVM_APIC_ARCHITECTURE > u32 bsp_vcpu_id; > #endif > diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c > index 912ec5a95e2c..20c3af7692c5 100644 > --- a/virt/kvm/eventfd.c > +++ b/virt/kvm/eventfd.c > @@ -31,6 +31,7 @@ > #include <linux/list.h> > #include <linux/eventfd.h> > #include <linux/kernel.h> > +#include <linux/srcu.h> > #include <linux/slab.h> > > #include "iodev.h" > @@ -118,19 +119,22 @@ static void > irqfd_resampler_ack(struct kvm_irq_ack_notifier *kian) > { > struct _irqfd_resampler *resampler; > + struct kvm *kvm; > struct _irqfd *irqfd; > + int idx; > > resampler = container_of(kian, struct _irqfd_resampler, notifier); > + kvm = resampler->kvm; > > - kvm_set_irq(resampler->kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID, > + kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID, > resampler->notifier.gsi, 0, false); > > - rcu_read_lock(); > + idx = srcu_read_lock(&kvm->irq_srcu); > > list_for_each_entry_rcu(irqfd, &resampler->list, resampler_link) > eventfd_signal(irqfd->resamplefd, 1); > > - rcu_read_unlock(); > + srcu_read_unlock(&kvm->irq_srcu, idx); > } > > static void > @@ -142,7 +146,7 @@ irqfd_resampler_shutdown(struct _irqfd *irqfd) > mutex_lock(&kvm->irqfds.resampler_lock); > > list_del_rcu(&irqfd->resampler_link); > - synchronize_rcu(); > + synchronize_srcu(&kvm->irq_srcu); > > if (list_empty(&resampler->list)) { > list_del(&resampler->link); > @@ -221,17 +225,18 @@ irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key) > unsigned long flags = (unsigned long)key; > struct kvm_kernel_irq_routing_entry *irq; > struct kvm *kvm = irqfd->kvm; > + int idx; > > if (flags & POLLIN) { > - rcu_read_lock(); > - irq = rcu_dereference(irqfd->irq_entry); > + idx = srcu_read_lock(&kvm->irq_srcu); > + irq = srcu_dereference(irqfd->irq_entry, &kvm->irq_srcu); > /* An event has been signaled, inject an interrupt */ > if (irq) > kvm_set_msi(irq, kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 1, > false); > else > schedule_work(&irqfd->inject); > - rcu_read_unlock(); > + srcu_read_unlock(&kvm->irq_srcu, idx); > } > > if (flags & POLLHUP) { > @@ -363,7 +368,7 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args) > } > > list_add_rcu(&irqfd->resampler_link, &irqfd->resampler->list); > - synchronize_rcu(); > + synchronize_srcu(&kvm->irq_srcu); No idea what resampler is, can this become time critical as well - iow do we need expedited here? > > mutex_unlock(&kvm->irqfds.resampler_lock); > } > @@ -465,7 +470,7 @@ kvm_irqfd_deassign(struct kvm *kvm, struct kvm_irqfd *args) > * another thread calls kvm_irq_routing_update before > * we flush workqueue below (we synchronize with > * kvm_irq_routing_update using irqfds.lock). > - * It is paired with synchronize_rcu done by caller > + * It is paired with synchronize_srcu done by caller > * of that function. > */ > rcu_assign_pointer(irqfd->irq_entry, NULL); > @@ -524,7 +529,7 @@ kvm_irqfd_release(struct kvm *kvm) > > /* > * Change irq_routing and irqfd. > - * Caller must invoke synchronize_rcu afterwards. > + * Caller must invoke synchronize_srcu(&kvm->irq_srcu) afterwards. > */ > void kvm_irq_routing_update(struct kvm *kvm, > struct kvm_irq_routing_table *irq_rt) > diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c > index e2e6b4473a96..ced4a542a031 100644 > --- a/virt/kvm/irq_comm.c > +++ b/virt/kvm/irq_comm.c > @@ -163,6 +163,7 @@ int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, int level) > struct kvm_kernel_irq_routing_entry *e; > int ret = -EINVAL; > struct kvm_irq_routing_table *irq_rt; > + int idx; > > trace_kvm_set_irq(irq, level, irq_source_id); > > @@ -174,8 +175,8 @@ int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, int level) > * Since there's no easy way to do this, we only support injecting MSI > * which is limited to 1:1 GSI mapping. > */ > - rcu_read_lock(); > - irq_rt = rcu_dereference(kvm->irq_routing); > + idx = srcu_read_lock(&kvm->irq_srcu); > + irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); > if (irq < irq_rt->nr_rt_entries) > hlist_for_each_entry(e, &irq_rt->map[irq], link) { > if (likely(e->type == KVM_IRQ_ROUTING_MSI)) > @@ -184,7 +185,7 @@ int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, int level) > ret = -EWOULDBLOCK; > break; > } > - rcu_read_unlock(); > + srcu_read_unlock(&kvm->irq_srcu, idx); > return ret; > } > > @@ -253,22 +254,22 @@ void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq, > mutex_lock(&kvm->irq_lock); > hlist_del_rcu(&kimn->link); > mutex_unlock(&kvm->irq_lock); > - synchronize_rcu(); > + synchronize_srcu(&kvm->irq_srcu); > } > > void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin, > bool mask) > { > struct kvm_irq_mask_notifier *kimn; > - int gsi; > + int idx, gsi; > > - rcu_read_lock(); > - gsi = rcu_dereference(kvm->irq_routing)->chip[irqchip][pin]; > + idx = srcu_read_lock(&kvm->irq_srcu); > + gsi = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu)->chip[irqchip][pin]; > if (gsi != -1) > hlist_for_each_entry_rcu(kimn, &kvm->mask_notifier_list, link) > if (kimn->irq == gsi) > kimn->func(kimn, mask); > - rcu_read_unlock(); > + srcu_read_unlock(&kvm->irq_srcu, idx); > } > > int kvm_set_routing_entry(struct kvm_irq_routing_table *rt, > diff --git a/virt/kvm/irqchip.c b/virt/kvm/irqchip.c > index 20dc9e4a8f6c..58bf5ba1aab1 100644 > --- a/virt/kvm/irqchip.c > +++ b/virt/kvm/irqchip.c > @@ -26,6 +26,7 @@ > > #include <linux/kvm_host.h> > #include <linux/slab.h> > +#include <linux/srcu.h> > #include <linux/export.h> > #include <trace/events/kvm.h> > #include "irq.h" > @@ -33,19 +34,19 @@ > bool kvm_irq_has_notifier(struct kvm *kvm, unsigned irqchip, unsigned pin) > { > struct kvm_irq_ack_notifier *kian; > - int gsi; > + int gsi, idx; > > - rcu_read_lock(); > - gsi = rcu_dereference(kvm->irq_routing)->chip[irqchip][pin]; > + idx = srcu_read_lock(&kvm->irq_srcu); > + gsi = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu)->chip[irqchip][pin]; > if (gsi != -1) > hlist_for_each_entry_rcu(kian, &kvm->irq_ack_notifier_list, > link) > if (kian->gsi == gsi) { > - rcu_read_unlock(); > + srcu_read_unlock(&kvm->irq_srcu, idx); > return true; > } > > - rcu_read_unlock(); > + srcu_read_unlock(&kvm->irq_srcu, idx); > > return false; > } > @@ -54,18 +55,18 @@ EXPORT_SYMBOL_GPL(kvm_irq_has_notifier); > void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin) > { > struct kvm_irq_ack_notifier *kian; > - int gsi; > + int gsi, idx; > > trace_kvm_ack_irq(irqchip, pin); > > - rcu_read_lock(); > - gsi = rcu_dereference(kvm->irq_routing)->chip[irqchip][pin]; > + idx = srcu_read_lock(&kvm->irq_srcu); > + gsi = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu)->chip[irqchip][pin]; > if (gsi != -1) > hlist_for_each_entry_rcu(kian, &kvm->irq_ack_notifier_list, > link) > if (kian->gsi == gsi) > kian->irq_acked(kian); > - rcu_read_unlock(); > + srcu_read_unlock(&kvm->irq_srcu, idx); > } > > void kvm_register_irq_ack_notifier(struct kvm *kvm, > @@ -85,7 +86,7 @@ void kvm_unregister_irq_ack_notifier(struct kvm *kvm, > mutex_lock(&kvm->irq_lock); > hlist_del_init_rcu(&kian->link); > mutex_unlock(&kvm->irq_lock); > - synchronize_rcu(); > + synchronize_srcu_expedited(&kvm->irq_srcu); Hmm, looks like all callers are slow path (shutdown, deregister assigned dev). Couldnt we use the non expedited variant? > #ifdef __KVM_HAVE_IOAPIC > kvm_vcpu_request_scan_ioapic(kvm); > #endif > @@ -115,7 +116,7 @@ int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level, > bool line_status) > { > struct kvm_kernel_irq_routing_entry *e, irq_set[KVM_NR_IRQCHIPS]; > - int ret = -1, i = 0; > + int ret = -1, i = 0, idx; > struct kvm_irq_routing_table *irq_rt; > > trace_kvm_set_irq(irq, level, irq_source_id); > @@ -124,12 +125,12 @@ int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level, > * IOAPIC. So set the bit in both. The guest will ignore > * writes to the unused one. > */ > - rcu_read_lock(); > - irq_rt = rcu_dereference(kvm->irq_routing); > + idx = srcu_read_lock(&kvm->irq_srcu); > + irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); > if (irq < irq_rt->nr_rt_entries) > hlist_for_each_entry(e, &irq_rt->map[irq], link) > irq_set[i++] = *e; > - rcu_read_unlock(); > + srcu_read_unlock(&kvm->irq_srcu, idx); > > while(i--) { > int r; > @@ -226,7 +227,7 @@ int kvm_set_irq_routing(struct kvm *kvm, > kvm_irq_routing_update(kvm, new); > mutex_unlock(&kvm->irq_lock); > > - synchronize_rcu(); > + synchronize_srcu_expedited(&kvm->irq_srcu); > > new = old; > r = 0; > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c > index fa70c6e642b4..95b4c2b3906a 100644 > --- a/virt/kvm/kvm_main.c > +++ b/virt/kvm/kvm_main.c > @@ -457,11 +457,11 @@ static struct kvm *kvm_create_vm(unsigned long type) > > r = kvm_arch_init_vm(kvm, type); > if (r) > - goto out_err_nodisable; > + goto out_err_no_disable; > > r = hardware_enable_all(); > if (r) > - goto out_err_nodisable; > + goto out_err_no_disable; > > #ifdef CONFIG_HAVE_KVM_IRQCHIP > INIT_HLIST_HEAD(&kvm->mask_notifier_list); > @@ -473,10 +473,12 @@ static struct kvm *kvm_create_vm(unsigned long type) > r = -ENOMEM; > kvm->memslots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); > if (!kvm->memslots) > - goto out_err_nosrcu; > + goto out_err_no_srcu; > kvm_init_memslots_id(kvm); > if (init_srcu_struct(&kvm->srcu)) > - goto out_err_nosrcu; > + goto out_err_no_srcu; > + if (init_srcu_struct(&kvm->irq_srcu)) > + goto out_err_no_irq_srcu; > for (i = 0; i < KVM_NR_BUSES; i++) { > kvm->buses[i] = kzalloc(sizeof(struct kvm_io_bus), > GFP_KERNEL); > @@ -505,10 +507,12 @@ static struct kvm *kvm_create_vm(unsigned long type) > return kvm; > > out_err: > + cleanup_srcu_struct(&kvm->irq_srcu); > +out_err_no_irq_srcu: > cleanup_srcu_struct(&kvm->srcu); > -out_err_nosrcu: > +out_err_no_srcu: > hardware_disable_all(); > -out_err_nodisable: > +out_err_no_disable: the patch would be smaller without this change, but it makes the naming more consistent, so ok. > for (i = 0; i < KVM_NR_BUSES; i++) > kfree(kvm->buses[i]); > kfree(kvm->memslots); > -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html