[PATCH 1/4] NUMA: change existing NUMA guest code to use new bitmap implementation

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



The current NUMA guest implementation uses a "poor-man's-bitmap"
consisting of a single uint64_t. This patch reworks this by
leveraging the new generic bitmap code and thus lifts the 64 VCPUs
limit for NUMA guests.
Besides that, it improves the NUMA data structures in preparation
for future host binding code.

Signed-off-by: Andre Przywara <andre.przywara@xxxxxxx>
---
 cpus.c    |    2 +-
 hw/pc.c   |    4 +-
 monitor.c |    2 +-
 sysemu.h  |   11 ++++++-
 vl.c      |   94 +++++++++++++++++++++++++++++++++++++++----------------------
 5 files changed, 73 insertions(+), 40 deletions(-)

diff --git a/cpus.c b/cpus.c
index 2e40814..86a0a47 100644
--- a/cpus.c
+++ b/cpus.c
@@ -805,7 +805,7 @@ void set_numa_modes(void)
 
     for (env = first_cpu; env != NULL; env = env->next_cpu) {
         for (i = 0; i < nb_numa_nodes; i++) {
-            if (node_cpumask[i] & (1 << env->cpu_index)) {
+            if (test_bit(env->cpu_index, numa_info[i].guest_cpu)) {
                 env->numa_node = i;
             }
         }
diff --git a/hw/pc.c b/hw/pc.c
index 89bd4af..1b24409 100644
--- a/hw/pc.c
+++ b/hw/pc.c
@@ -529,14 +529,14 @@ static void *bochs_bios_init(void)
     numa_fw_cfg[0] = cpu_to_le64(nb_numa_nodes);
     for (i = 0; i < smp_cpus; i++) {
         for (j = 0; j < nb_numa_nodes; j++) {
-            if (node_cpumask[j] & (1 << i)) {
+            if (test_bit(i, numa_info[j].guest_cpu)) {
                 numa_fw_cfg[i + 1] = cpu_to_le64(j);
                 break;
             }
         }
     }
     for (i = 0; i < nb_numa_nodes; i++) {
-        numa_fw_cfg[smp_cpus + 1 + i] = cpu_to_le64(node_mem[i]);
+        numa_fw_cfg[smp_cpus + 1 + i] = cpu_to_le64(numa_info[i].guest_mem);
     }
     fw_cfg_add_bytes(fw_cfg, FW_CFG_NUMA, (uint8_t *)numa_fw_cfg,
                      (1 + smp_cpus + nb_numa_nodes) * 8);
diff --git a/monitor.c b/monitor.c
index e51df62..74da6c4 100644
--- a/monitor.c
+++ b/monitor.c
@@ -1983,7 +1983,7 @@ static void do_info_numa(Monitor *mon)
         }
         monitor_printf(mon, "\n");
         monitor_printf(mon, "node %d size: %" PRId64 " MB\n", i,
-            node_mem[i] >> 20);
+            numa_info[i].guest_mem >> 20);
     }
 }
 
diff --git a/sysemu.h b/sysemu.h
index bf1d68a..e5f88d1 100644
--- a/sysemu.h
+++ b/sysemu.h
@@ -7,6 +7,7 @@
 #include "qemu-queue.h"
 #include "qemu-timer.h"
 #include "notify.h"
+#include "bitmap.h"
 
 #ifdef _WIN32
 #include <windows.h>
@@ -136,9 +137,15 @@ extern QEMUClock *rtc_clock;
 extern long hpagesize;
 
 #define MAX_NODES 64
+#ifndef MAX_NUMA_VCPUS
+#define MAX_NUMA_VCPUS 256
+#endif
 extern int nb_numa_nodes;
-extern uint64_t node_mem[MAX_NODES];
-extern uint64_t node_cpumask[MAX_NODES];
+struct numa_info {
+    uint64_t guest_mem;                        /* guest memory of this node, in bytes */
+    DECLARE_BITMAP(guest_cpu, MAX_NUMA_VCPUS); /* bit n set: VCPU index n is on this node */
+};
+extern struct numa_info numa_info[MAX_NODES];
 
 #define MAX_OPTION_ROMS 16
 extern const char *option_rom[MAX_OPTION_ROMS];
diff --git a/vl.c b/vl.c
index 3d8298e..40fac59 100644
--- a/vl.c
+++ b/vl.c
@@ -161,6 +161,7 @@ int main(int argc, char **argv)
 #include "qemu-queue.h"
 #include "cpus.h"
 #include "arch_init.h"
+#include "bitmap.h"
 
 //#define DEBUG_NET
 //#define DEBUG_SLIRP
@@ -230,8 +231,7 @@ const char *nvram = NULL;
 int boot_menu;
 
 int nb_numa_nodes;
-uint64_t node_mem[MAX_NODES];
-uint64_t node_cpumask[MAX_NODES];
+struct numa_info numa_info[MAX_NODES];
 
 static QEMUTimer *nographic_timer;
 
@@ -717,11 +717,51 @@ static void restore_boot_devices(void *opaque)
     qemu_free(standard_boot_devices);
 }
 
+/* Parse "[!][+]N[-M]" or "all" into bm (maxlen bits); < 0 on bad input. */
+static int parse_bitmap(const char *str, unsigned long *bm, int maxlen)
+{
+    unsigned long long value, endvalue;
+    char *endptr;
+    unsigned int flags = 0;   /* returned: bit 1 = '!' seen, bit 0 = '+' seen */
+
+    if (str[0] == '!') {      /* negated: start full, clear the range below */
+        flags |= 2;
+        bitmap_fill(bm, maxlen);
+        str++;
+    }
+    if (str[0] == '+') {
+        flags |= 1;
+        str++;
+    }
+    value = strtoull(str, &endptr, 10);
+    if (endptr == str) {      /* no digits: only the keyword "all" is valid */
+        if (strcmp(str, "all"))
+            return -1;
+        bitmap_fill(bm, maxlen);
+        return flags;
+    }
+    if (value >= maxlen)
+        return -value;
+    endvalue = value;         /* single bit unless a "-M" range end follows */
+    if (*endptr == '-') {
+        endvalue = strtoull(endptr + 1, &endptr, 10);
+        if (endvalue >= maxlen)
+            endvalue = maxlen - 1;   /* clamp to the last valid bit index */
+    }
+    if (endvalue < value)     /* reversed/empty range end would underflow */
+        return -1;
+    if (flags & 2)
+        bitmap_clear(bm, value, endvalue + 1 - value);
+    else
+        bitmap_set(bm, value, endvalue + 1 - value);
+    return flags;
+}
+
 static void numa_add(const char *optarg)
 {
     char option[128];
     char *endptr;
-    unsigned long long value, endvalue;
+    unsigned long long value;
     int nodenr;
 
     optarg = get_opt_name(option, 128, optarg, ',') + 1;
@@ -733,7 +773,7 @@ static void numa_add(const char *optarg)
         }
 
         if (get_param_value(option, 128, "mem", optarg) == 0) {
-            node_mem[nodenr] = 0;
+            numa_info[nodenr].guest_mem = 0;
         } else {
             value = strtoull(option, &endptr, 0);
             switch (*endptr) {
@@ -744,29 +784,12 @@ static void numa_add(const char *optarg)
                 value <<= 30;
                 break;
             }
-            node_mem[nodenr] = value;
+            numa_info[nodenr].guest_mem = value;
         }
         if (get_param_value(option, 128, "cpus", optarg) == 0) {
-            node_cpumask[nodenr] = 0;
+            bitmap_zero(numa_info[nodenr].guest_cpu, MAX_NUMA_VCPUS);
         } else {
-            value = strtoull(option, &endptr, 10);
-            if (value >= 64) {
-                value = 63;
-                fprintf(stderr, "only 64 CPUs in NUMA mode supported.\n");
-            } else {
-                if (*endptr == '-') {
-                    endvalue = strtoull(endptr+1, &endptr, 10);
-                    if (endvalue >= 63) {
-                        endvalue = 62;
-                        fprintf(stderr,
-                            "only 63 CPUs in NUMA mode supported.\n");
-                    }
-                    value = (2ULL << endvalue) - (1ULL << value);
-                } else {
-                    value = 1ULL << value;
-                }
-            }
-            node_cpumask[nodenr] = value;
+            parse_bitmap(option, numa_info[nodenr].guest_cpu, MAX_NUMA_VCPUS);
         }
         nb_numa_nodes++;
     }
@@ -1870,8 +1893,8 @@ int main(int argc, char **argv, char **envp)
     translation = BIOS_ATA_TRANSLATION_AUTO;
 
     for (i = 0; i < MAX_NODES; i++) {
-        node_mem[i] = 0;
-        node_cpumask[i] = 0;
+        numa_info[i].guest_mem = 0;
+        bitmap_zero(numa_info[i].guest_cpu, MAX_NUMA_VCPUS);
     }
 
     assigned_devices_index = 0;
@@ -2887,7 +2910,7 @@ int main(int argc, char **argv, char **envp)
          * and distribute the available memory equally across all nodes
          */
         for (i = 0; i < nb_numa_nodes; i++) {
-            if (node_mem[i] != 0)
+            if (numa_info[i].guest_mem != 0)
                 break;
         }
         if (i == nb_numa_nodes) {
@@ -2897,14 +2920,18 @@ int main(int argc, char **argv, char **envp)
              * the final node gets the rest.
              */
             for (i = 0; i < nb_numa_nodes - 1; i++) {
-                node_mem[i] = (ram_size / nb_numa_nodes) & ~((1 << 23UL) - 1);
-                usedmem += node_mem[i];
+                numa_info[i].guest_mem = (ram_size / nb_numa_nodes) &
+                    ~((1 << 23UL) - 1);
+                usedmem += numa_info[i].guest_mem;
             }
-            node_mem[i] = ram_size - usedmem;
+            numa_info[i].guest_mem = ram_size - usedmem;
         }
 
+        /* check whether any guest CPU number has been specified.
+         * If not, we use an automatic assignment algorithm.
+         */
         for (i = 0; i < nb_numa_nodes; i++) {
-            if (node_cpumask[i] != 0)
+            if (!bitmap_empty(numa_info[i].guest_cpu, MAX_NUMA_VCPUS))
                 break;
         }
         /* assigning the VCPUs round-robin is easier to implement, guest OSes
@@ -2912,9 +2939,8 @@ int main(int argc, char **argv, char **envp)
          * real machines which also use this scheme.
          */
         if (i == nb_numa_nodes) {
-            for (i = 0; i < smp_cpus; i++) {
-                node_cpumask[i % nb_numa_nodes] |= 1 << i;
-            }
+            for (i = 0; i < smp_cpus; i++)
+                set_bit(i, numa_info[i % nb_numa_nodes].guest_cpu);
         }
     }
 
-- 
1.6.4


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[Index of Archives]     [KVM ARM]     [KVM ia64]     [KVM ppc]     [Virtualization Tools]     [Spice Development]     [Libvirt]     [Libvirt Users]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite Questions]     [Linux Kernel]     [Linux SCSI]     [XFree86]
  Powered by Linux