[PATCH] qemu_agent: Issue guest-sync prior to every command

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



If we issue guest command and GA is not running, the issuing thread
will block endlessly. We can check for GA presence by issuing
guest-sync with unique ID (timestamp). We don't want to issue real
command as even if GA is not running, once it is started, it process
all commands written to GA socket.

If we receive reply, it might be from a previous sync command, so
we need to compare received ID for equality with the given one.
---

Some background on this:
-GA socked is always writeable, even with no GA running (!)

-writing something to dead socket (=without any GA listening) will thus
 succeed. However, when one will (after a period of time) start the GA,
 it will read and process all written data. This makes timeout
 implementation harder.

-Therefore issuing a GA command without any GA running will block indefinitely.

My solution to this is: Issue harmless guest-sync command which returns
given identifier. To avoid blocking, do a timed wait (5 seconds).
This is the only case where we do a timed wait, regular GA commands
will of course wait without any timeout.
If GA didn't reply within those 5 seconds, we store an ID so we can compare
it later and report 'GA unavailable'; Note that guest won't get into
a different state as libvirt thinks it is in.
Then, if GA is ever started, it start to process our sync commands.
And all we have to do is check if we have ever issued such ID.
If not an error is reported.

However, there is still race:
1) GA will reply to our guest-sync
2) GA gets terminated
3) we issue regular command (e.g. fs-freeze)

I am not sure this is avoidable. But this patch makes current situation bearable.

 src/qemu/qemu_agent.c |  165 +++++++++++++++++++++++++++++++++++++++++++++++--
 1 files changed, 159 insertions(+), 6 deletions(-)

diff --git a/src/qemu/qemu_agent.c b/src/qemu/qemu_agent.c
index 9df5546..5ecf893 100644
--- a/src/qemu/qemu_agent.c
+++ b/src/qemu/qemu_agent.c
@@ -39,6 +39,7 @@
 #include "virterror_internal.h"
 #include "json.h"
 #include "virfile.h"
+#include "virtime.h"
 
 #define VIR_FROM_THIS VIR_FROM_QEMU
 
@@ -106,6 +107,10 @@ struct _qemuAgent {
     /* If anything went wrong, this will be fed back
      * the next monitor msg */
     virError lastError;
+
+    /* Array of IDs issued to GA via guest-sync */
+    unsigned long long *ids;
+    size_t ids_size;
 };
 
 #if DEBUG_RAW_IO
@@ -147,6 +152,7 @@ static void qemuAgentFree(qemuAgentPtr mon)
         (mon->cb->destroy)(mon, mon->vm);
     ignore_value(virCondDestroy(&mon->notify));
     virMutexDestroy(&mon->lock);
+    VIR_FREE(mon->ids);
     VIR_FREE(mon->buffer);
     VIR_FREE(mon);
 }
@@ -316,6 +322,8 @@ qemuAgentIOProcessLine(qemuAgentPtr mon,
 {
     virJSONValuePtr obj = NULL;
     int ret = -1;
+    size_t i;
+    unsigned long long id;
 
     VIR_DEBUG("Line [%s]", line);
 
@@ -340,6 +348,40 @@ qemuAgentIOProcessLine(qemuAgentPtr mon,
             obj = NULL;
             ret = 0;
         } else {
+            /* If we've received something like:
+             *  {"return": 1234}
+             * it is likely that somebody started GA
+             * which is now processing our previous
+             * guest-sync command. Check this and if so,
+             * don't report an error however return silently
+             */
+            if (virJSONValueObjectGetNumberUlong(obj, "return", &id) == 0) {
+                for (i = 0; i < mon->ids_size; i++) {
+                    if (mon->ids[i] != id)
+                        continue;
+
+                    VIR_DEBUG("Received delayed reply to guest-sync: %llu",
+                              id);
+
+                    if (mon->ids_size > 1) {
+                        memmove(&mon->ids[i],
+                                &mon->ids[i + 1],
+                                sizeof(*mon->ids) * (mon->ids_size - (i + 1)));
+
+                        mon->ids_size--;
+                        if (VIR_REALLOC_N(mon->ids, mon->ids_size) < 0) {
+                            /* Ignore, harmless */
+                        }
+                    } else {
+                        VIR_FREE(mon->ids);
+                        mon->ids_size = 0;
+                    }
+
+                    ret = 0;
+                    goto cleanup;
+                }
+            }
+
             qemuReportError(VIR_ERR_INTERNAL_ERROR,
                             _("Unexpected JSON reply '%s'"), line);
         }
@@ -813,11 +855,28 @@ void qemuAgentClose(qemuAgentPtr mon)
         qemuAgentUnlock(mon);
 }
 
+#define QEMU_AGENT_WAIT_TIME (1000ull * 5)
 
+/**
+ * qemuAgentSend:
+ * @mon: Monitor
+ * @msg: Message
+ * @timeout: use timeout?
+ *
+ * Send @msg to agent @mon.
+ * Wait max QEMU_AGENT_WAIT_TIME for agent
+ * to reply.
+ *
+ * Returns: 0 on success,
+ *          -2 on timeout,
+ *          -1 otherwise
+ */
 static int qemuAgentSend(qemuAgentPtr mon,
-                         qemuAgentMessagePtr msg)
+                         qemuAgentMessagePtr msg,
+                         bool timeout)
 {
     int ret = -1;
+    unsigned long long now, then;
 
     /* Check whether qemu quit unexpectedly */
     if (mon->lastError.code != VIR_ERR_OK) {
@@ -827,13 +886,26 @@ static int qemuAgentSend(qemuAgentPtr mon,
         return -1;
     }
 
+    if (timeout) {
+        if (virTimeMillisNow(&now) < 0)
+            return -1;
+        then = now + QEMU_AGENT_WAIT_TIME;
+    }
+
     mon->msg = msg;
     qemuAgentUpdateWatch(mon);
 
     while (!mon->msg->finished) {
-        if (virCondWait(&mon->notify, &mon->lock) < 0) {
-            qemuReportError(VIR_ERR_INTERNAL_ERROR, "%s",
-                            _("Unable to wait on monitor condition"));
+        if ((timeout && virCondWaitUntil(&mon->notify, &mon->lock, then) < 0) ||
+            (!timeout && virCondWait(&mon->notify, &mon->lock) < 0)) {
+            if (errno == ETIMEDOUT) {
+                qemuReportError(VIR_ERR_INTERNAL_ERROR, "%s",
+                                _("Guest agent not available for now"));
+                ret = -2;
+            } else {
+                qemuReportError(VIR_ERR_INTERNAL_ERROR, "%s",
+                                _("Unable to wait on monitor condition"));
+            }
             goto cleanup;
         }
     }
@@ -855,6 +927,83 @@ cleanup:
 }
 
 
+/**
+ * qemuAgentGuestSync:
+ * @mon: Monitor
+ *
+ * Send guest-sync with unique ID and wait for
+ * reply. If we get one, check if received ID
+ * is equal to given.
+ *
+ * Returns: 0 on success,
+ *          -1 otherwise
+ */
+static int
+qemuAgentGuestSync(qemuAgentPtr mon)
+{
+    int ret = -1;
+    int send_ret;
+    unsigned long long id, id_ret;
+    qemuAgentMessage sync_msg;
+
+    memset(&sync_msg, 0, sizeof sync_msg);
+
+    if (virTimeMillisNow(&id) < 0)
+        return -1;
+
+    if (virAsprintf(&sync_msg.txBuffer,
+                    "{\"execute\":\"guest-sync\", "
+                    "\"arguments\":{\"id\":%llu}}", id) < 0) {
+        virReportOOMError();
+        return -1;
+    }
+    sync_msg.txLength = strlen(sync_msg.txBuffer);
+
+    VIR_DEBUG("Sending guest-sync command with ID: %llu", id);
+
+    send_ret = qemuAgentSend(mon, &sync_msg, true);
+
+    VIR_DEBUG("Agent guest-sync returned: %d", send_ret);
+
+    if (send_ret == -1) {
+        /* error reported */
+        goto cleanup;
+    } else if (send_ret == -2) {
+        /* don't report errors as we don't want to
+         * overwrite the one from qemuAgentSend() */
+        if (VIR_REALLOC_N(mon->ids, mon->ids_size) == 0)
+            mon->ids[mon->ids_size++] = id;
+        goto cleanup;
+    }
+
+    if (!sync_msg.rxObject) {
+        qemuReportError(VIR_ERR_INTERNAL_ERROR, "%s",
+                        _("Missing monitor reply object"));
+        goto cleanup;
+    }
+
+    if (virJSONValueObjectGetNumberUlong(sync_msg.rxObject,
+                                         "return", &id_ret) < 0) {
+        qemuReportError(VIR_ERR_INTERNAL_ERROR, "%s",
+                        _("Malformed return value"));
+        goto cleanup;
+    }
+
+    VIR_DEBUG("Guest returned ID: %llu", id_ret);
+    if (id_ret != id) {
+        qemuReportError(VIR_ERR_INTERNAL_ERROR,
+                        _("Guest agent returned ID: %llu instead of %llu"),
+                        id_ret, id);
+        goto cleanup;
+    }
+    ret = 0;
+
+cleanup:
+    virJSONValueFree(sync_msg.rxObject);
+    VIR_FREE(sync_msg.txBuffer);
+    return ret;
+}
+
 static int
 qemuAgentCommand(qemuAgentPtr mon,
                  virJSONValuePtr cmd,
@@ -866,6 +1015,11 @@ qemuAgentCommand(qemuAgentPtr mon,
 
     *reply = NULL;
 
+    if (qemuAgentGuestSync(mon) < 0) {
+        /* helper reported the error */
+        return -1;
+    }
+
     memset(&msg, 0, sizeof msg);
 
     if (!(cmdstr = virJSONValueToString(cmd))) {
@@ -880,12 +1034,11 @@ qemuAgentCommand(qemuAgentPtr mon,
 
     VIR_DEBUG("Send command '%s' for write", cmdstr);
 
-    ret = qemuAgentSend(mon, &msg);
+    ret = qemuAgentSend(mon, &msg, false);
 
     VIR_DEBUG("Receive command reply ret=%d rxObject=%p",
               ret, msg.rxObject);
 
-
     if (ret == 0) {
         if (!msg.rxObject) {
             qemuReportError(VIR_ERR_INTERNAL_ERROR, "%s",
-- 
1.7.3.4

--
libvir-list mailing list
libvir-list@xxxxxxxxxx
https://www.redhat.com/mailman/listinfo/libvir-list


[Index of Archives]     [Virt Tools]     [Libvirt Users]     [Lib OS Info]     [Fedora Users]     [Fedora Desktop]     [Fedora SELinux]     [Big List of Linux Books]     [Yosemite News]     [KDE Users]     [Fedora Tools]