Re: [PATCH v11 2/7] KVM: stats: Add fd-based API to read binary stats data

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Fri, Jun 18, 2021 at 2:01 AM Greg KH <gregkh@xxxxxxxxxxxxxxxxxxx> wrote:
>
> On Fri, Jun 18, 2021 at 04:48:14AM +0000, Jing Zhang wrote:
> > This commit defines the API for userspace and prepare the common
> > functionalities to support per VM/VCPU binary stats data readings.
> >
> > The KVM stats now is only accessible by debugfs, which has some
> > shortcomings this change series are supposed to fix:
> > 1. The current debugfs stats solution in KVM could be disabled
> >    when kernel Lockdown mode is enabled, which is a potential
> >    rick for production.
> > 2. The current debugfs stats solution in KVM is organized as "one
> >    stats per file", it is good for debugging, but not efficient
> >    for production.
> > 3. The stats read/clear in current debugfs solution in KVM are
> >    protected by the global kvm_lock.
> >
> > Besides that, there are some other benefits with this change:
> > 1. All KVM VM/VCPU stats can be read out in a bulk by one copy
> >    to userspace.
> > 2. A schema is used to describe KVM statistics. From userspace's
> >    perspective, the KVM statistics are self-describing.
> > 3. With the fd-based solution, a separate telemetry would be able
> >    to read KVM stats in a less privileged environment.
> > 4. After the initial setup by reading in stats descriptors, a
> >    telemetry only needs to read the stats data itself, no more
> >    parsing or setup is needed.
> >
> > Reviewed-by: David Matlack <dmatlack@xxxxxxxxxx>
> > Reviewed-by: Ricardo Koller <ricarkol@xxxxxxxxxx>
> > Reviewed-by: Krish Sadhukhan <krish.sadhukhan@xxxxxxxxxx>
> > Reviewed-by: Fuad Tabba <tabba@xxxxxxxxxx>
> > Tested-by: Fuad Tabba <tabba@xxxxxxxxxx> #arm64
> > Signed-off-by: Jing Zhang <jingzhangos@xxxxxxxxxx>
> > ---
> >  arch/arm64/kvm/Makefile   |   2 +-
> >  arch/mips/kvm/Makefile    |   2 +-
> >  arch/powerpc/kvm/Makefile |   2 +-
> >  arch/s390/kvm/Makefile    |   3 +-
> >  arch/x86/kvm/Makefile     |   2 +-
> >  include/linux/kvm_host.h  | 145 ++++++++++++++++++++++++++++++++++++++
> >  include/uapi/linux/kvm.h  |  42 +++++++++++
> >  virt/kvm/binary_stats.c   | 130 ++++++++++++++++++++++++++++++++++
> >  8 files changed, 323 insertions(+), 5 deletions(-)
> >  create mode 100644 virt/kvm/binary_stats.c
> >
> > diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile
> > index 589921392cb1..989bb5dad2c8 100644
> > --- a/arch/arm64/kvm/Makefile
> > +++ b/arch/arm64/kvm/Makefile
> > @@ -11,7 +11,7 @@ obj-$(CONFIG_KVM) += kvm.o
> >  obj-$(CONFIG_KVM) += hyp/
> >
> >  kvm-y := $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o $(KVM)/eventfd.o \
> > -      $(KVM)/vfio.o $(KVM)/irqchip.o \
> > +      $(KVM)/vfio.o $(KVM)/irqchip.o $(KVM)/binary_stats.o \
> >        arm.o mmu.o mmio.o psci.o perf.o hypercalls.o pvtime.o \
> >        inject_fault.o va_layout.o handle_exit.o \
> >        guest.o debug.o reset.o sys_regs.o \
> > diff --git a/arch/mips/kvm/Makefile b/arch/mips/kvm/Makefile
> > index 30cc060857c7..c67250a956b8 100644
> > --- a/arch/mips/kvm/Makefile
> > +++ b/arch/mips/kvm/Makefile
> > @@ -2,7 +2,7 @@
> >  # Makefile for KVM support for MIPS
> >  #
> >
> > -common-objs-y = $(addprefix ../../../virt/kvm/, kvm_main.o coalesced_mmio.o eventfd.o)
> > +common-objs-y = $(addprefix ../../../virt/kvm/, kvm_main.o coalesced_mmio.o eventfd.o binary_stats.o)
> >
> >  EXTRA_CFLAGS += -Ivirt/kvm -Iarch/mips/kvm
> >
> > diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
> > index 2bfeaa13befb..b347d043b932 100644
> > --- a/arch/powerpc/kvm/Makefile
> > +++ b/arch/powerpc/kvm/Makefile
> > @@ -6,7 +6,7 @@
> >  ccflags-y := -Ivirt/kvm -Iarch/powerpc/kvm
> >  KVM := ../../../virt/kvm
> >
> > -common-objs-y = $(KVM)/kvm_main.o $(KVM)/eventfd.o
> > +common-objs-y = $(KVM)/kvm_main.o $(KVM)/eventfd.o $(KVM)/binary_stats.o
> >  common-objs-$(CONFIG_KVM_VFIO) += $(KVM)/vfio.o
> >  common-objs-$(CONFIG_KVM_MMIO) += $(KVM)/coalesced_mmio.o
> >
> > diff --git a/arch/s390/kvm/Makefile b/arch/s390/kvm/Makefile
> > index 12decca22e7c..b3aaadc60ead 100644
> > --- a/arch/s390/kvm/Makefile
> > +++ b/arch/s390/kvm/Makefile
> > @@ -4,7 +4,8 @@
> >  # Copyright IBM Corp. 2008
> >
> >  KVM := ../../../virt/kvm
> > -common-objs = $(KVM)/kvm_main.o $(KVM)/eventfd.o  $(KVM)/async_pf.o $(KVM)/irqchip.o $(KVM)/vfio.o
> > +common-objs = $(KVM)/kvm_main.o $(KVM)/eventfd.o  $(KVM)/async_pf.o \
> > +           $(KVM)/irqchip.o $(KVM)/vfio.o $(KVM)/binary_stats.o
> >
> >  ccflags-y := -Ivirt/kvm -Iarch/s390/kvm
> >
> > diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
> > index 83331376b779..75dfd27b6e8a 100644
> > --- a/arch/x86/kvm/Makefile
> > +++ b/arch/x86/kvm/Makefile
> > @@ -11,7 +11,7 @@ KVM := ../../../virt/kvm
> >
> >  kvm-y                        += $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \
> >                               $(KVM)/eventfd.o $(KVM)/irqchip.o $(KVM)/vfio.o \
> > -                             $(KVM)/dirty_ring.o
> > +                             $(KVM)/dirty_ring.o $(KVM)/binary_stats.o
> >  kvm-$(CONFIG_KVM_ASYNC_PF)   += $(KVM)/async_pf.o
> >
> >  kvm-y                        += x86.o emulate.o i8259.o irq.o lapic.o \
> > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > index 5a31e0696360..2f0d12064ae7 100644
> > --- a/include/linux/kvm_host.h
> > +++ b/include/linux/kvm_host.h
> > @@ -1272,6 +1272,12 @@ struct kvm_stats_debugfs_item {
> >       int mode;
> >  };
> >
> > +#define KVM_STATS_NAME_LEN   48
> > +struct _kvm_stats_desc {
> > +     struct kvm_stats_desc desc;
> > +     char name[KVM_STATS_NAME_LEN];
> > +};
> > +
> >  #define KVM_DBGFS_GET_MODE(dbgfs_item)                                         \
> >       ((dbgfs_item)->mode ? (dbgfs_item)->mode : 0644)
> >
> > @@ -1285,8 +1291,147 @@ struct kvm_stats_debugfs_item {
> >       { n, offsetof(struct kvm_vcpu, stat.generic.x),                        \
> >         KVM_STAT_VCPU, ## __VA_ARGS__ }
> >
> > +#define STATS_DESC_COMMON(type, unit, base, exp)                            \
> > +     .flags = type | unit | base |                                          \
> > +         BUILD_BUG_ON_ZERO(type & ~KVM_STATS_TYPE_MASK) |                   \
> > +         BUILD_BUG_ON_ZERO(unit & ~KVM_STATS_UNIT_MASK) |                   \
> > +         BUILD_BUG_ON_ZERO(base & ~KVM_STATS_BASE_MASK),                    \
> > +     .exponent = exp,                                                       \
> > +     .size = 1
> > +
> > +#define VM_GENERIC_STATS_DESC(stat, type, unit, base, exp)                  \
> > +     {                                                                      \
> > +             {                                                              \
> > +                     STATS_DESC_COMMON(type, unit, base, exp),              \
> > +                     .offset = offsetof(struct kvm_vm_stat, generic.stat)   \
> > +             },                                                             \
> > +             .name = #stat,                                                 \
> > +     }
> > +#define VCPU_GENERIC_STATS_DESC(stat, type, unit, base, exp)                \
> > +     {                                                                      \
> > +             {                                                              \
> > +                     STATS_DESC_COMMON(type, unit, base, exp),              \
> > +                     .offset = offsetof(struct kvm_vcpu_stat, generic.stat) \
> > +             },                                                             \
> > +             .name = #stat,                                                 \
> > +     }
> > +#define VM_STATS_DESC(stat, type, unit, base, exp)                          \
> > +     {                                                                      \
> > +             {                                                              \
> > +                     STATS_DESC_COMMON(type, unit, base, exp),              \
> > +                     .offset = offsetof(struct kvm_vm_stat, stat)           \
> > +             },                                                             \
> > +             .name = #stat,                                                 \
> > +     }
> > +#define VCPU_STATS_DESC(stat, type, unit, base, exp)                        \
> > +     {                                                                      \
> > +             {                                                              \
> > +                     STATS_DESC_COMMON(type, unit, base, exp),              \
> > +                     .offset = offsetof(struct kvm_vcpu_stat, stat)         \
> > +             },                                                             \
> > +             .name = #stat,                                                 \
> > +     }
> > +/* SCOPE: VM, VM_GENERIC, VCPU, VCPU_GENERIC */
> > +#define STATS_DESC(SCOPE, stat, type, unit, base, exp)                              \
> > +     SCOPE##_STATS_DESC(stat, type, unit, base, exp)
> > +
> > +#define STATS_DESC_CUMULATIVE(SCOPE, name, unit, base, exponent)            \
> > +     STATS_DESC(SCOPE, name, KVM_STATS_TYPE_CUMULATIVE,                     \
> > +                   unit, base, exponent)
> > +#define STATS_DESC_INSTANT(SCOPE, name, unit, base, exponent)                       \
> > +     STATS_DESC(SCOPE, name, KVM_STATS_TYPE_INSTANT, unit, base, exponent)  \
> > +
> > +/* Cumulative counter */
> > +#define STATS_DESC_COUNTER(SCOPE, name)                                             \
> > +     STATS_DESC_CUMULATIVE(SCOPE, name, KVM_STATS_UNIT_NONE,                \
> > +             KVM_STATS_BASE_POW10, 0)
> > +/* Instantaneous counter */
> > +#define STATS_DESC_ICOUNTER(SCOPE, name)                                    \
> > +     STATS_DESC_INSTANT(SCOPE, name, KVM_STATS_UNIT_NONE,                   \
> > +             KVM_STATS_BASE_POW10, 0)
> > +
> > +/* Cumulative clock cycles */
> > +#define STATS_DESC_CYCLE(SCOPE, name)                                               \
> > +     STATS_DESC_CUMULATIVE(SCOPE, name, KVM_STATS_UNIT_CYCLES,              \
> > +             KVM_STATS_BASE_POW10, 0)
> > +/* Instantaneous clock cycles */
> > +#define STATS_DESC_ICYCLE(SCOPE, name)                                              \
> > +     STATS_DESC_INSTANT(SCOPE, name, KVM_STATS_UNIT_CYCLES,                 \
> > +             KVM_STATS_BASE_POW10, 0)
> > +
> > +/* Cumulative memory size in Byte */
> > +#define STATS_DESC_SIZE_BYTE(SCOPE, name)                                   \
> > +     STATS_DESC_CUMULATIVE(SCOPE, name, KVM_STATS_UNIT_BYTES,               \
> > +             KVM_STATS_BASE_POW2, 0)
> > +/* Cumulative memory size in KiByte */
> > +#define STATS_DESC_SIZE_KBYTE(SCOPE, name)                                  \
> > +     STATS_DESC_CUMULATIVE(SCOPE, name, KVM_STATS_UNIT_BYTES,               \
> > +             KVM_STATS_BASE_POW2, 10)
> > +/* Cumulative memory size in MiByte */
> > +#define STATS_DESC_SIZE_MBYTE(SCOPE, name)                                  \
> > +     STATS_DESC_CUMULATIVE(SCOPE, name, KVM_STATS_UNIT_BYTES,               \
> > +             KVM_STATS_BASE_POW2, 20)
> > +/* Cumulative memory size in GiByte */
> > +#define STATS_DESC_SIZE_GBYTE(SCOPE, name)                                  \
> > +     STATS_DESC_CUMULATIVE(SCOPE, name, KVM_STATS_UNIT_BYTES,               \
> > +             KVM_STATS_BASE_POW2, 30)
> > +
> > +/* Instantaneous memory size in Byte */
> > +#define STATS_DESC_ISIZE_BYTE(SCOPE, name)                                  \
> > +     STATS_DESC_INSTANT(SCOPE, name, KVM_STATS_UNIT_BYTES,                  \
> > +             KVM_STATS_BASE_POW2, 0)
> > +/* Instantaneous memory size in KiByte */
> > +#define STATS_DESC_ISIZE_KBYTE(SCOPE, name)                                 \
> > +     STATS_DESC_INSTANT(SCOPE, name, KVM_STATS_UNIT_BYTES,                  \
> > +             KVM_STATS_BASE_POW2, 10)
> > +/* Instantaneous memory size in MiByte */
> > +#define STATS_DESC_ISIZE_MBYTE(SCOPE, name)                                 \
> > +     STATS_DESC_INSTANT(SCOPE, name, KVM_STATS_UNIT_BYTES,                  \
> > +             KVM_STATS_BASE_POW2, 20)
> > +/* Instantaneous memory size in GiByte */
> > +#define STATS_DESC_ISIZE_GBYTE(SCOPE, name)                                 \
> > +     STATS_DESC_INSTANT(SCOPE, name, KVM_STATS_UNIT_BYTES,                  \
> > +             KVM_STATS_BASE_POW2, 30)
> > +
> > +/* Cumulative time in second */
> > +#define STATS_DESC_TIME_SEC(SCOPE, name)                                    \
> > +     STATS_DESC_CUMULATIVE(SCOPE, name, KVM_STATS_UNIT_SECONDS,             \
> > +             KVM_STATS_BASE_POW10, 0)
> > +/* Cumulative time in millisecond */
> > +#define STATS_DESC_TIME_MSEC(SCOPE, name)                                   \
> > +     STATS_DESC_CUMULATIVE(SCOPE, name, KVM_STATS_UNIT_SECONDS,             \
> > +             KVM_STATS_BASE_POW10, -3)
> > +/* Cumulative time in microsecond */
> > +#define STATS_DESC_TIME_USEC(SCOPE, name)                                   \
> > +     STATS_DESC_CUMULATIVE(SCOPE, name, KVM_STATS_UNIT_SECONDS,             \
> > +             KVM_STATS_BASE_POW10, -6)
> > +/* Cumulative time in nanosecond */
> > +#define STATS_DESC_TIME_NSEC(SCOPE, name)                                   \
> > +     STATS_DESC_CUMULATIVE(SCOPE, name, KVM_STATS_UNIT_SECONDS,             \
> > +             KVM_STATS_BASE_POW10, -9)
> > +
> > +/* Instantaneous time in second */
> > +#define STATS_DESC_ITIME_SEC(SCOPE, name)                                   \
> > +     STATS_DESC_INSTANT(SCOPE, name, KVM_STATS_UNIT_SECONDS,                \
> > +             KVM_STATS_BASE_POW10, 0)
> > +/* Instantaneous time in millisecond */
> > +#define STATS_DESC_ITIME_MSEC(SCOPE, name)                                  \
> > +     STATS_DESC_INSTANT(SCOPE, name, KVM_STATS_UNIT_SECONDS,                \
> > +             KVM_STATS_BASE_POW10, -3)
> > +/* Instantaneous time in microsecond */
> > +#define STATS_DESC_ITIME_USEC(SCOPE, name)                                  \
> > +     STATS_DESC_INSTANT(SCOPE, name, KVM_STATS_UNIT_SECONDS,                \
> > +             KVM_STATS_BASE_POW10, -6)
> > +/* Instantaneous time in nanosecond */
> > +#define STATS_DESC_ITIME_NSEC(SCOPE, name)                                  \
> > +     STATS_DESC_INSTANT(SCOPE, name, KVM_STATS_UNIT_SECONDS,                \
> > +             KVM_STATS_BASE_POW10, -9)
> > +
> >  extern struct kvm_stats_debugfs_item debugfs_entries[];
> >  extern struct dentry *kvm_debugfs_dir;
> > +ssize_t kvm_stats_read(char *id, struct kvm_stats_header *header,
> > +             struct _kvm_stats_desc *desc, void *stats, size_t size_stats,
> > +             char __user *user_buffer, size_t size, loff_t *offset);
> >
> >  #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
> >  static inline int mmu_notifier_retry(struct kvm *kvm, unsigned long mmu_seq)
> > diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> > index 9febe1412f7a..ab73e905105c 100644
> > --- a/include/uapi/linux/kvm.h
> > +++ b/include/uapi/linux/kvm.h
> > @@ -1086,6 +1086,7 @@ struct kvm_ppc_resize_hpt {
> >  #define KVM_CAP_HYPERV_ENFORCE_CPUID 199
> >  #define KVM_CAP_SREGS2 200
> >  #define KVM_CAP_EXIT_HYPERCALL 201
> > +#define KVM_CAP_BINARY_STATS_FD 202
> >
> >  #ifdef KVM_CAP_IRQ_ROUTING
> >
> > @@ -1905,4 +1906,45 @@ struct kvm_dirty_gfn {
> >  #define KVM_BUS_LOCK_DETECTION_OFF             (1 << 0)
> >  #define KVM_BUS_LOCK_DETECTION_EXIT            (1 << 1)
> >
> > +#define KVM_STATS_ID_MAXLEN          64
> > +
> > +struct kvm_stats_header {
> > +     __u32 name_size;
> > +     __u32 count;
> > +     __u32 desc_offset;
> > +     __u32 data_offset;
> > +     char id[];
> > +};
>
> You mentioned before that the size of this really is the size of the
> structure + KVM_STATS_ID_MAXLEN, right?  Or is it - KVM_STATS_ID_MAXLEN?
>
> If so, why not put that value explicitly in:
>         char id[THE_REST_OF_THE_HEADER_SPACE];
>
> As this is not a variable header size at all, and you can not change it
> going forward, so the variable length array here feels disingenuous.
The size of header is not supposed to change, it is sizeof(struct
kvm_stats_header) +
KVM_STATS_ID_MAXLEN. Userspace needs this when reading the header.
Sorry for the confusion, will make that clear.

The reason char id[] is used instead of char id[KVM_STATS_ID_MAXLEN] is because
the header is only defined (and can be const as you mentioned) for
every architecture, but
the id string is different for every VM and VCPU. The content of the
header is combined
on-the-fly in the kvm_stats_read function.
If we use id[KVM_STATS_ID_MAXLEN] here, then we will waste some memory in every
header definition.
Another option is to define another internal header structure without
id[] field, which can be
used to define the const part of header for every architecture. But
that will have two structures
for the same header, we need to make sure these two structures are the
same except the id[]
field.

So, Greg, Paolo, What do you think? Which solution should we choose
from 3 options?
>
>
>
> > +
> > +#define KVM_STATS_TYPE_SHIFT         0
> > +#define KVM_STATS_TYPE_MASK          (0xF << KVM_STATS_TYPE_SHIFT)
> > +#define KVM_STATS_TYPE_CUMULATIVE    (0x0 << KVM_STATS_TYPE_SHIFT)
> > +#define KVM_STATS_TYPE_INSTANT               (0x1 << KVM_STATS_TYPE_SHIFT)
> > +#define KVM_STATS_TYPE_MAX           KVM_STATS_TYPE_INSTANT
> > +
> > +#define KVM_STATS_UNIT_SHIFT         4
> > +#define KVM_STATS_UNIT_MASK          (0xF << KVM_STATS_UNIT_SHIFT)
> > +#define KVM_STATS_UNIT_NONE          (0x0 << KVM_STATS_UNIT_SHIFT)
> > +#define KVM_STATS_UNIT_BYTES         (0x1 << KVM_STATS_UNIT_SHIFT)
> > +#define KVM_STATS_UNIT_SECONDS               (0x2 << KVM_STATS_UNIT_SHIFT)
> > +#define KVM_STATS_UNIT_CYCLES                (0x3 << KVM_STATS_UNIT_SHIFT)
> > +#define KVM_STATS_UNIT_MAX           KVM_STATS_UNIT_CYCLES
> > +
> > +#define KVM_STATS_BASE_SHIFT         8
> > +#define KVM_STATS_BASE_MASK          (0xF << KVM_STATS_BASE_SHIFT)
> > +#define KVM_STATS_BASE_POW10         (0x0 << KVM_STATS_BASE_SHIFT)
> > +#define KVM_STATS_BASE_POW2          (0x1 << KVM_STATS_BASE_SHIFT)
> > +#define KVM_STATS_BASE_MAX           KVM_STATS_BASE_POW2
> > +
> > +struct kvm_stats_desc {
> > +     __u32 flags;
> > +     __s16 exponent;
> > +     __u16 size;
> > +     __u32 offset;
> > +     __u32 unused;
> > +     char name[];
> > +};
>
> What is the max length of name?
>
As Paolo mentioned, it is read from the header.
> Why aren't these structures defined here in kerneldoc so that we can
> understand them better?  Putting them in a .rst file guarantees they
> will get out of sync, and you can always directly import the kerneldoc
> into the .rst file.
>
> thanks,
>
> greg k-h



[Index of Archives]     [LKML Archive]     [Linux ARM Kernel]     [Linux ARM]     [Git]     [Yosemite News]     [Linux SCSI]     [Linux Hams]

  Powered by Linux