Re: [PATCH v11 2/7] KVM: stats: Add fd-based API to read binary stats data

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Fri, Jun 18, 2021 at 04:48:14AM +0000, Jing Zhang wrote:
> This commit defines the API for userspace and prepare the common
> functionalities to support per VM/VCPU binary stats data readings.
> 
> The KVM stats now is only accessible by debugfs, which has some
> shortcomings this change series are supposed to fix:
> 1. The current debugfs stats solution in KVM could be disabled
>    when kernel Lockdown mode is enabled, which is a potential
>    rick for production.
> 2. The current debugfs stats solution in KVM is organized as "one
>    stats per file", it is good for debugging, but not efficient
>    for production.
> 3. The stats read/clear in current debugfs solution in KVM are
>    protected by the global kvm_lock.
> 
> Besides that, there are some other benefits with this change:
> 1. All KVM VM/VCPU stats can be read out in a bulk by one copy
>    to userspace.
> 2. A schema is used to describe KVM statistics. From userspace's
>    perspective, the KVM statistics are self-describing.
> 3. With the fd-based solution, a separate telemetry would be able
>    to read KVM stats in a less privileged environment.
> 4. After the initial setup by reading in stats descriptors, a
>    telemetry only needs to read the stats data itself, no more
>    parsing or setup is needed.
> 
> Reviewed-by: David Matlack <dmatlack@xxxxxxxxxx>
> Reviewed-by: Ricardo Koller <ricarkol@xxxxxxxxxx>
> Reviewed-by: Krish Sadhukhan <krish.sadhukhan@xxxxxxxxxx>
> Reviewed-by: Fuad Tabba <tabba@xxxxxxxxxx>
> Tested-by: Fuad Tabba <tabba@xxxxxxxxxx> #arm64
> Signed-off-by: Jing Zhang <jingzhangos@xxxxxxxxxx>
> ---
>  arch/arm64/kvm/Makefile   |   2 +-
>  arch/mips/kvm/Makefile    |   2 +-
>  arch/powerpc/kvm/Makefile |   2 +-
>  arch/s390/kvm/Makefile    |   3 +-
>  arch/x86/kvm/Makefile     |   2 +-
>  include/linux/kvm_host.h  | 145 ++++++++++++++++++++++++++++++++++++++
>  include/uapi/linux/kvm.h  |  42 +++++++++++
>  virt/kvm/binary_stats.c   | 130 ++++++++++++++++++++++++++++++++++
>  8 files changed, 323 insertions(+), 5 deletions(-)
>  create mode 100644 virt/kvm/binary_stats.c
> 
> diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile
> index 589921392cb1..989bb5dad2c8 100644
> --- a/arch/arm64/kvm/Makefile
> +++ b/arch/arm64/kvm/Makefile
> @@ -11,7 +11,7 @@ obj-$(CONFIG_KVM) += kvm.o
>  obj-$(CONFIG_KVM) += hyp/
>  
>  kvm-y := $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o $(KVM)/eventfd.o \
> -	 $(KVM)/vfio.o $(KVM)/irqchip.o \
> +	 $(KVM)/vfio.o $(KVM)/irqchip.o $(KVM)/binary_stats.o \
>  	 arm.o mmu.o mmio.o psci.o perf.o hypercalls.o pvtime.o \
>  	 inject_fault.o va_layout.o handle_exit.o \
>  	 guest.o debug.o reset.o sys_regs.o \
> diff --git a/arch/mips/kvm/Makefile b/arch/mips/kvm/Makefile
> index 30cc060857c7..c67250a956b8 100644
> --- a/arch/mips/kvm/Makefile
> +++ b/arch/mips/kvm/Makefile
> @@ -2,7 +2,7 @@
>  # Makefile for KVM support for MIPS
>  #
>  
> -common-objs-y = $(addprefix ../../../virt/kvm/, kvm_main.o coalesced_mmio.o eventfd.o)
> +common-objs-y = $(addprefix ../../../virt/kvm/, kvm_main.o coalesced_mmio.o eventfd.o binary_stats.o)
>  
>  EXTRA_CFLAGS += -Ivirt/kvm -Iarch/mips/kvm
>  
> diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
> index 2bfeaa13befb..b347d043b932 100644
> --- a/arch/powerpc/kvm/Makefile
> +++ b/arch/powerpc/kvm/Makefile
> @@ -6,7 +6,7 @@
>  ccflags-y := -Ivirt/kvm -Iarch/powerpc/kvm
>  KVM := ../../../virt/kvm
>  
> -common-objs-y = $(KVM)/kvm_main.o $(KVM)/eventfd.o
> +common-objs-y = $(KVM)/kvm_main.o $(KVM)/eventfd.o $(KVM)/binary_stats.o
>  common-objs-$(CONFIG_KVM_VFIO) += $(KVM)/vfio.o
>  common-objs-$(CONFIG_KVM_MMIO) += $(KVM)/coalesced_mmio.o
>  
> diff --git a/arch/s390/kvm/Makefile b/arch/s390/kvm/Makefile
> index 12decca22e7c..b3aaadc60ead 100644
> --- a/arch/s390/kvm/Makefile
> +++ b/arch/s390/kvm/Makefile
> @@ -4,7 +4,8 @@
>  # Copyright IBM Corp. 2008
>  
>  KVM := ../../../virt/kvm
> -common-objs = $(KVM)/kvm_main.o $(KVM)/eventfd.o  $(KVM)/async_pf.o $(KVM)/irqchip.o $(KVM)/vfio.o
> +common-objs = $(KVM)/kvm_main.o $(KVM)/eventfd.o  $(KVM)/async_pf.o \
> +	      $(KVM)/irqchip.o $(KVM)/vfio.o $(KVM)/binary_stats.o
>  
>  ccflags-y := -Ivirt/kvm -Iarch/s390/kvm
>  
> diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
> index 83331376b779..75dfd27b6e8a 100644
> --- a/arch/x86/kvm/Makefile
> +++ b/arch/x86/kvm/Makefile
> @@ -11,7 +11,7 @@ KVM := ../../../virt/kvm
>  
>  kvm-y			+= $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \
>  				$(KVM)/eventfd.o $(KVM)/irqchip.o $(KVM)/vfio.o \
> -				$(KVM)/dirty_ring.o
> +				$(KVM)/dirty_ring.o $(KVM)/binary_stats.o
>  kvm-$(CONFIG_KVM_ASYNC_PF)	+= $(KVM)/async_pf.o
>  
>  kvm-y			+= x86.o emulate.o i8259.o irq.o lapic.o \
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 5a31e0696360..2f0d12064ae7 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -1272,6 +1272,12 @@ struct kvm_stats_debugfs_item {
>  	int mode;
>  };
>  
> +#define KVM_STATS_NAME_LEN	48
> +struct _kvm_stats_desc {
> +	struct kvm_stats_desc desc;
> +	char name[KVM_STATS_NAME_LEN];
> +};
> +
>  #define KVM_DBGFS_GET_MODE(dbgfs_item)                                         \
>  	((dbgfs_item)->mode ? (dbgfs_item)->mode : 0644)
>  
> @@ -1285,8 +1291,147 @@ struct kvm_stats_debugfs_item {
>  	{ n, offsetof(struct kvm_vcpu, stat.generic.x),			       \
>  	  KVM_STAT_VCPU, ## __VA_ARGS__ }
>  
> +#define STATS_DESC_COMMON(type, unit, base, exp)			       \
> +	.flags = type | unit | base |					       \
> +	    BUILD_BUG_ON_ZERO(type & ~KVM_STATS_TYPE_MASK) |		       \
> +	    BUILD_BUG_ON_ZERO(unit & ~KVM_STATS_UNIT_MASK) |		       \
> +	    BUILD_BUG_ON_ZERO(base & ~KVM_STATS_BASE_MASK),		       \
> +	.exponent = exp,						       \
> +	.size = 1
> +
> +#define VM_GENERIC_STATS_DESC(stat, type, unit, base, exp)		       \
> +	{								       \
> +		{							       \
> +			STATS_DESC_COMMON(type, unit, base, exp),	       \
> +			.offset = offsetof(struct kvm_vm_stat, generic.stat)   \
> +		},							       \
> +		.name = #stat,						       \
> +	}
> +#define VCPU_GENERIC_STATS_DESC(stat, type, unit, base, exp)		       \
> +	{								       \
> +		{							       \
> +			STATS_DESC_COMMON(type, unit, base, exp),	       \
> +			.offset = offsetof(struct kvm_vcpu_stat, generic.stat) \
> +		},							       \
> +		.name = #stat,						       \
> +	}
> +#define VM_STATS_DESC(stat, type, unit, base, exp)			       \
> +	{								       \
> +		{							       \
> +			STATS_DESC_COMMON(type, unit, base, exp),	       \
> +			.offset = offsetof(struct kvm_vm_stat, stat)	       \
> +		},							       \
> +		.name = #stat,						       \
> +	}
> +#define VCPU_STATS_DESC(stat, type, unit, base, exp)			       \
> +	{								       \
> +		{							       \
> +			STATS_DESC_COMMON(type, unit, base, exp),	       \
> +			.offset = offsetof(struct kvm_vcpu_stat, stat)	       \
> +		},							       \
> +		.name = #stat,						       \
> +	}
> +/* SCOPE: VM, VM_GENERIC, VCPU, VCPU_GENERIC */
> +#define STATS_DESC(SCOPE, stat, type, unit, base, exp)			       \
> +	SCOPE##_STATS_DESC(stat, type, unit, base, exp)
> +
> +#define STATS_DESC_CUMULATIVE(SCOPE, name, unit, base, exponent)	       \
> +	STATS_DESC(SCOPE, name, KVM_STATS_TYPE_CUMULATIVE,		       \
> +		      unit, base, exponent)
> +#define STATS_DESC_INSTANT(SCOPE, name, unit, base, exponent)		       \
> +	STATS_DESC(SCOPE, name, KVM_STATS_TYPE_INSTANT, unit, base, exponent)  \
> +
> +/* Cumulative counter */
> +#define STATS_DESC_COUNTER(SCOPE, name)					       \
> +	STATS_DESC_CUMULATIVE(SCOPE, name, KVM_STATS_UNIT_NONE,		       \
> +		KVM_STATS_BASE_POW10, 0)
> +/* Instantaneous counter */
> +#define STATS_DESC_ICOUNTER(SCOPE, name)				       \
> +	STATS_DESC_INSTANT(SCOPE, name, KVM_STATS_UNIT_NONE,		       \
> +		KVM_STATS_BASE_POW10, 0)
> +
> +/* Cumulative clock cycles */
> +#define STATS_DESC_CYCLE(SCOPE, name)					       \
> +	STATS_DESC_CUMULATIVE(SCOPE, name, KVM_STATS_UNIT_CYCLES,	       \
> +		KVM_STATS_BASE_POW10, 0)
> +/* Instantaneous clock cycles */
> +#define STATS_DESC_ICYCLE(SCOPE, name)					       \
> +	STATS_DESC_INSTANT(SCOPE, name, KVM_STATS_UNIT_CYCLES,		       \
> +		KVM_STATS_BASE_POW10, 0)
> +
> +/* Cumulative memory size in Byte */
> +#define STATS_DESC_SIZE_BYTE(SCOPE, name)				       \
> +	STATS_DESC_CUMULATIVE(SCOPE, name, KVM_STATS_UNIT_BYTES,	       \
> +		KVM_STATS_BASE_POW2, 0)
> +/* Cumulative memory size in KiByte */
> +#define STATS_DESC_SIZE_KBYTE(SCOPE, name)				       \
> +	STATS_DESC_CUMULATIVE(SCOPE, name, KVM_STATS_UNIT_BYTES,	       \
> +		KVM_STATS_BASE_POW2, 10)
> +/* Cumulative memory size in MiByte */
> +#define STATS_DESC_SIZE_MBYTE(SCOPE, name)				       \
> +	STATS_DESC_CUMULATIVE(SCOPE, name, KVM_STATS_UNIT_BYTES,	       \
> +		KVM_STATS_BASE_POW2, 20)
> +/* Cumulative memory size in GiByte */
> +#define STATS_DESC_SIZE_GBYTE(SCOPE, name)				       \
> +	STATS_DESC_CUMULATIVE(SCOPE, name, KVM_STATS_UNIT_BYTES,	       \
> +		KVM_STATS_BASE_POW2, 30)
> +
> +/* Instantaneous memory size in Byte */
> +#define STATS_DESC_ISIZE_BYTE(SCOPE, name)				       \
> +	STATS_DESC_INSTANT(SCOPE, name, KVM_STATS_UNIT_BYTES,		       \
> +		KVM_STATS_BASE_POW2, 0)
> +/* Instantaneous memory size in KiByte */
> +#define STATS_DESC_ISIZE_KBYTE(SCOPE, name)				       \
> +	STATS_DESC_INSTANT(SCOPE, name, KVM_STATS_UNIT_BYTES,		       \
> +		KVM_STATS_BASE_POW2, 10)
> +/* Instantaneous memory size in MiByte */
> +#define STATS_DESC_ISIZE_MBYTE(SCOPE, name)				       \
> +	STATS_DESC_INSTANT(SCOPE, name, KVM_STATS_UNIT_BYTES,		       \
> +		KVM_STATS_BASE_POW2, 20)
> +/* Instantaneous memory size in GiByte */
> +#define STATS_DESC_ISIZE_GBYTE(SCOPE, name)				       \
> +	STATS_DESC_INSTANT(SCOPE, name, KVM_STATS_UNIT_BYTES,		       \
> +		KVM_STATS_BASE_POW2, 30)
> +
> +/* Cumulative time in second */
> +#define STATS_DESC_TIME_SEC(SCOPE, name)				       \
> +	STATS_DESC_CUMULATIVE(SCOPE, name, KVM_STATS_UNIT_SECONDS,	       \
> +		KVM_STATS_BASE_POW10, 0)
> +/* Cumulative time in millisecond */
> +#define STATS_DESC_TIME_MSEC(SCOPE, name)				       \
> +	STATS_DESC_CUMULATIVE(SCOPE, name, KVM_STATS_UNIT_SECONDS,	       \
> +		KVM_STATS_BASE_POW10, -3)
> +/* Cumulative time in microsecond */
> +#define STATS_DESC_TIME_USEC(SCOPE, name)				       \
> +	STATS_DESC_CUMULATIVE(SCOPE, name, KVM_STATS_UNIT_SECONDS,	       \
> +		KVM_STATS_BASE_POW10, -6)
> +/* Cumulative time in nanosecond */
> +#define STATS_DESC_TIME_NSEC(SCOPE, name)				       \
> +	STATS_DESC_CUMULATIVE(SCOPE, name, KVM_STATS_UNIT_SECONDS,	       \
> +		KVM_STATS_BASE_POW10, -9)
> +
> +/* Instantaneous time in second */
> +#define STATS_DESC_ITIME_SEC(SCOPE, name)				       \
> +	STATS_DESC_INSTANT(SCOPE, name, KVM_STATS_UNIT_SECONDS,		       \
> +		KVM_STATS_BASE_POW10, 0)
> +/* Instantaneous time in millisecond */
> +#define STATS_DESC_ITIME_MSEC(SCOPE, name)				       \
> +	STATS_DESC_INSTANT(SCOPE, name, KVM_STATS_UNIT_SECONDS,		       \
> +		KVM_STATS_BASE_POW10, -3)
> +/* Instantaneous time in microsecond */
> +#define STATS_DESC_ITIME_USEC(SCOPE, name)				       \
> +	STATS_DESC_INSTANT(SCOPE, name, KVM_STATS_UNIT_SECONDS,		       \
> +		KVM_STATS_BASE_POW10, -6)
> +/* Instantaneous time in nanosecond */
> +#define STATS_DESC_ITIME_NSEC(SCOPE, name)				       \
> +	STATS_DESC_INSTANT(SCOPE, name, KVM_STATS_UNIT_SECONDS,		       \
> +		KVM_STATS_BASE_POW10, -9)
> +
>  extern struct kvm_stats_debugfs_item debugfs_entries[];
>  extern struct dentry *kvm_debugfs_dir;
> +ssize_t kvm_stats_read(char *id, struct kvm_stats_header *header,
> +		struct _kvm_stats_desc *desc, void *stats, size_t size_stats,
> +		char __user *user_buffer, size_t size, loff_t *offset);
>  
>  #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
>  static inline int mmu_notifier_retry(struct kvm *kvm, unsigned long mmu_seq)
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index 9febe1412f7a..ab73e905105c 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -1086,6 +1086,7 @@ struct kvm_ppc_resize_hpt {
>  #define KVM_CAP_HYPERV_ENFORCE_CPUID 199
>  #define KVM_CAP_SREGS2 200
>  #define KVM_CAP_EXIT_HYPERCALL 201
> +#define KVM_CAP_BINARY_STATS_FD 202
>  
>  #ifdef KVM_CAP_IRQ_ROUTING
>  
> @@ -1905,4 +1906,45 @@ struct kvm_dirty_gfn {
>  #define KVM_BUS_LOCK_DETECTION_OFF             (1 << 0)
>  #define KVM_BUS_LOCK_DETECTION_EXIT            (1 << 1)
>  
> +#define KVM_STATS_ID_MAXLEN		64
> +
> +struct kvm_stats_header {
> +	__u32 name_size;
> +	__u32 count;
> +	__u32 desc_offset;
> +	__u32 data_offset;
> +	char id[];
> +};

You mentioned before that the size of this really is the size of the
structure + KVM_STATS_ID_MAXLEN, right?  Or is it - KVM_STATS_ID_MAXLEN?

If so, why not put that value explicitly in:
	char id[THE_REST_OF_THE_HEADER_SPACE];

As this is not a variable header size at all, and you can not change it
going forward, so the variable length array here feels disingenuous.



> +
> +#define KVM_STATS_TYPE_SHIFT		0
> +#define KVM_STATS_TYPE_MASK		(0xF << KVM_STATS_TYPE_SHIFT)
> +#define KVM_STATS_TYPE_CUMULATIVE	(0x0 << KVM_STATS_TYPE_SHIFT)
> +#define KVM_STATS_TYPE_INSTANT		(0x1 << KVM_STATS_TYPE_SHIFT)
> +#define KVM_STATS_TYPE_MAX		KVM_STATS_TYPE_INSTANT
> +
> +#define KVM_STATS_UNIT_SHIFT		4
> +#define KVM_STATS_UNIT_MASK		(0xF << KVM_STATS_UNIT_SHIFT)
> +#define KVM_STATS_UNIT_NONE		(0x0 << KVM_STATS_UNIT_SHIFT)
> +#define KVM_STATS_UNIT_BYTES		(0x1 << KVM_STATS_UNIT_SHIFT)
> +#define KVM_STATS_UNIT_SECONDS		(0x2 << KVM_STATS_UNIT_SHIFT)
> +#define KVM_STATS_UNIT_CYCLES		(0x3 << KVM_STATS_UNIT_SHIFT)
> +#define KVM_STATS_UNIT_MAX		KVM_STATS_UNIT_CYCLES
> +
> +#define KVM_STATS_BASE_SHIFT		8
> +#define KVM_STATS_BASE_MASK		(0xF << KVM_STATS_BASE_SHIFT)
> +#define KVM_STATS_BASE_POW10		(0x0 << KVM_STATS_BASE_SHIFT)
> +#define KVM_STATS_BASE_POW2		(0x1 << KVM_STATS_BASE_SHIFT)
> +#define KVM_STATS_BASE_MAX		KVM_STATS_BASE_POW2
> +
> +struct kvm_stats_desc {
> +	__u32 flags;
> +	__s16 exponent;
> +	__u16 size;
> +	__u32 offset;
> +	__u32 unused;
> +	char name[];
> +};

What is the max length of name?

Why aren't these structures defined here in kerneldoc so that we can
understand them better?  Putting them in a .rst file guarantees they
will get out of sync, and you can always directly import the kerneldoc
into the .rst file.

thanks,

greg k-h



[Index of Archives]     [LKML Archive]     [Linux ARM Kernel]     [Linux ARM]     [Git]     [Yosemite News]     [Linux SCSI]     [Linux Hams]

  Powered by Linux