On 04/01/2015 05:42 PM, Charles Arnold wrote: > Under certain conditions I see the guest CPU usage display not > being updated. The problem seems most prevalent when Xen > is the underlying hypervisor but I have seen it occasionally with > KVM. > > What appears to be happening is that when a VM is created, the > engine's _handle_tick_queue() calls the connection tick() method > which in turn calls _tick(). In here it calls _update_vms() which > gets a new vmmDomain object. Then it spawns a new thread to > call tick_send_signals() where if 'pollvm' is True then 'self._vms' > will get the newly create vmmDomain object. > > The problem (I think) is that under certain conditions the thread > spun up by self.idle_add(tick_send_signals) doesn't run quickly > enough (and therefore doesn't set self._vms) until after _tick() returns > and is called again and second vmmDomain object is created. > > At this point it appears that the first vmmDomain object collects the > cpu stats from libvirt while the second vmmDomain object updates > the display which doesn't have the stats and therefore nothing appears. > That analysis sounds reasonable. Since tick() is running in a thread, if the mainloop is handling lots of UI stuff or similar it seems possible that tick() could be invoked twice before the mainloop handles tick_send_signals. Unfortunately our threading model is pretty hacky here since we have threads touch a bunch of shared state without any locking, but it's simple and generally works out fine. > Below are a couple approaches to solving the problem (assuming > my analysis is correct). This first one simply yields to give the > tick_send_signals thread a chance to run. > > diff --git a/virtManager/connection.py b/virtManager/connection.py > index a907a3f..af27141 100644 > --- a/virtManager/connection.py > +++ b/virtManager/connection.py > @@ -1240,6 +1240,9 @@ class vmmConnection(vmmGObject): > self._change_state(self._STATE_ACTIVE) > > self.idle_add(tick_send_signals) > + if len(self._vms) < len(vms): > + # Allow time for tick_send_signals to run > + time.sleep(.1) > > ticklist = [] > def add_to_ticklist(l, args=()): > Obviously a timeout is sketchy since it might just make it less likely to trigger. > This second solution doesn't wait for the thread and sets self._vms > if pollvm is True. > > diff --git a/virtManager/connection.py b/virtManager/connection.py > index a907a3f..96c208e 100644 > --- a/virtManager/connection.py > +++ b/virtManager/connection.py > @@ -1240,6 +1240,8 @@ class vmmConnection(vmmGObject): > self._change_state(self._STATE_ACTIVE) > > self.idle_add(tick_send_signals) > + if pollvm: > + self._vms = vms > > ticklist = [] > def add_to_ticklist(l, args=()): > > Any thoughts on this problem and the potential solutions? We can't update any of the conn lists in the thread since it's possible the self._vms is in use by the mainloop, which can cause weird errors if it's updated while another part of the code is iterating over it or similar. I think the proper thing to do short of some larger refactoring is to use a threading.Lock(). conn.tick() will grab the lock at the start, tick_send_signals will release the Lock when it's updated the conn state. So tick() can't run until the previous tick_send_signals has finished. If you can reproduce easily, does this patch fix it (and not cause issues?). I only did light testing. Not pushed yet Thanks, Cole
>From 5be81cd95512131699b07580e5513a221531afdb Mon Sep 17 00:00:00 2001 Message-Id: <5be81cd95512131699b07580e5513a221531afdb.1427929997.git.crobinso@xxxxxxxxxx> From: Cole Robinson <crobinso@xxxxxxxxxx> Date: Wed, 1 Apr 2015 19:10:16 -0400 Subject: [PATCH virt-manager] connection: Protect conn state tick() updates with a lock If the mainloop is backed up, tick_send_signals might not run before another conn.tick call is scheduled by the tick thread. conn.tick would then be operating on incorrect self._vms since it was never updated. Don't run another tick() until tick_send_signals has released a lock. https://www.redhat.com/archives/virt-tools-list/2015-April/msg00009.html Reported-by: Charles Arnold <carnold@xxxxxxxx> --- virtManager/connection.py | 78 +++++++++++++++++++++++++++++------------------ 1 file changed, 48 insertions(+), 30 deletions(-) diff --git a/virtManager/connection.py b/virtManager/connection.py index a907a3f..3bfd118 100644 --- a/virtManager/connection.py +++ b/virtManager/connection.py @@ -23,13 +23,14 @@ from gi.repository import GObject import logging import os import socket +import threading import time import traceback -from virtinst import support import libvirt import virtinst from virtinst import pollhelpers +from virtinst import support from virtinst import util from . import connectauth @@ -119,6 +120,8 @@ class vmmConnection(vmmGObject): self.record = [] self.hostinfo = None + self._tick_lock = threading.Lock() + self.mediadev_initialized = False self.mediadev_error = "" self.mediadev_use_libvirt = False @@ -1144,21 +1147,31 @@ class vmmConnection(vmmGObject): self.hostinfo = self._backend.getInfo() - (goneNets, newNets, nets) = self._update_nets(pollnet) - self._refresh_new_objects(newNets.values()) - (gonePools, newPools, pools) = self._update_pools(pollpool) - self._refresh_new_objects(newPools.values()) - (goneInterfaces, - newInterfaces, interfaces) = self._update_interfaces(polliface) - self._refresh_new_objects(newInterfaces.values()) - - # Refreshing these is handled by the mediadev callback - (goneNodedevs, - newNodedevs, nodedevs) = self._update_nodedevs(pollnodedev) - - # These are refreshing in their __init__ method, because the - # data is wanted immediately - (goneVMs, newVMs, vms) = self._update_vms(pollvm) + try: + # We take the ticklock before using the conn object lists + # like self._vms. This ensures that those lists were updated + # by tick_send_signals since the last time we ran tick() + # https://www.redhat.com/archives/virt-tools-list/2015-April/msg00009.html + self._tick_lock.acquire() + + (goneNets, newNets, nets) = self._update_nets(pollnet) + self._refresh_new_objects(newNets.values()) + (gonePools, newPools, pools) = self._update_pools(pollpool) + self._refresh_new_objects(newPools.values()) + (goneInterfaces, + newInterfaces, interfaces) = self._update_interfaces(polliface) + self._refresh_new_objects(newInterfaces.values()) + + # Refreshing these is handled by the mediadev callback + (goneNodedevs, + newNodedevs, nodedevs) = self._update_nodedevs(pollnodedev) + + # These are refreshing in their __init__ method, because the + # data is wanted immediately + (goneVMs, newVMs, vms) = self._update_vms(pollvm) + except: + self._tick_lock.release() + raise def tick_send_signals(): """ @@ -1166,20 +1179,23 @@ class vmmConnection(vmmGObject): updates need to go here to enable threading that doesn't block the app with long tick operations. """ - # Connection closed out from under us - if not self._backend.is_open(): - return - - if pollvm: - self._vms = vms - if pollnet: - self._nets = nets - if polliface: - self._interfaces = interfaces - if pollpool: - self._pools = pools - if pollnodedev: - self._nodedevs = nodedevs + try: + # Connection closed out from under us + if not self._backend.is_open(): + return + + if pollvm: + self._vms = vms + if pollnet: + self._nets = nets + if polliface: + self._interfaces = interfaces + if pollpool: + self._pools = pools + if pollnodedev: + self._nodedevs = nodedevs + finally: + self._tick_lock.release() if not self.mediadev_initialized: self._init_mediadev() @@ -1239,6 +1255,8 @@ class vmmConnection(vmmGObject): if finish_connecting: self._change_state(self._STATE_ACTIVE) + # Anything that could possibly fail before this call needs to go + # in the try/except that handles the tick lock self.idle_add(tick_send_signals) ticklist = [] -- 2.3.4
_______________________________________________ virt-tools-list mailing list virt-tools-list@xxxxxxxxxx https://www.redhat.com/mailman/listinfo/virt-tools-list