The 2nd patch changes the definition of perf_event to facilitate copying
the perf attr when a hypercall happens.

Signed-off-by: Zhang Yanmin <yanmin_zhang@xxxxxxxxxxxxxxx>

---

--- linux-2.6_tip0620/include/linux/perf_event.h	2010-06-21 15:19:52.821999849 +0800
+++ linux-2.6_tip0620perfkvm/include/linux/perf_event.h	2010-06-21 16:53:49.283999849 +0800
@@ -188,7 +188,10 @@ struct perf_event_attr {
 	__u64			sample_type;
 	__u64			read_format;

-	__u64			disabled     :  1, /* off by default        */
+	union {
+		__u64		flags;
+		struct {
+			__u64	disabled     :  1, /* off by default        */
 				inherit      :  1, /* children inherit it   */
 				pinned       :  1, /* must always be on PMU */
 				exclusive    :  1, /* only group on PMU     */
@@ -217,6 +220,8 @@ struct perf_event_attr {
 				mmap_data    :  1, /* non-exec mmap data    */

 				__reserved_1 : 46;
+		};
+	};

 	union {
 		__u32		wakeup_events;	  /* wakeup every n events */
@@ -465,12 +470,6 @@ enum perf_callchain_context {
 # include <asm/local64.h>
 #endif

-struct perf_guest_info_callbacks {
-	int (*is_in_guest) (void);
-	int (*is_user_mode) (void);
-	unsigned long (*get_guest_ip) (void);
-};
-
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
 #include <asm/hw_breakpoint.h>
 #endif
@@ -753,6 +752,20 @@ struct perf_event {

 	perf_overflow_handler_t		overflow_handler;

+	/*
+	 * Pointers used by the kvm perf paravirt interface:
+	 *
+	 * 1) Used in the host kernel; points to a host_perf_shadow,
+	 * which carries information about the guest perf_event.
+	 */
+	void *host_perf_shadow;
+	/*
+	 * 2) Used in the guest kernel; points to a guest_perf_shadow,
+	 * which serves as a communication area with the host kernel.
+	 * The host kernel copies overflow data into it when an event
+	 * overflows.
+	 */
+	void *guest_perf_shadow;
+
 #ifdef CONFIG_EVENT_TRACING
 	struct ftrace_event_call	*tp_event;
 	struct event_filter		*filter;
@@ -838,6 +851,16 @@ struct perf_output_handle {
 	int				sample;
 };

+struct perf_guest_info_callbacks {
+	/* Support collecting guest statistics from the host side */
+	int (*is_in_guest) (void);
+	int (*is_user_mode) (void);
+	unsigned long (*get_guest_ip) (void);
+
+	/* Support the paravirt interface */
+	void (*copy_event_to_shadow) (struct perf_event *event, int overflows);
+};
+
 #ifdef CONFIG_PERF_EVENTS

 /*
@@ -871,6 +894,10 @@ perf_event_create_kernel_counter(struct
 				perf_overflow_handler_t callback);
 extern u64 perf_event_read_value(struct perf_event *event,
 				 u64 *enabled, u64 *running);
+extern void perf_event_output(struct perf_event *event, int nmi,
+		struct perf_sample_data *data, struct pt_regs *regs);
+void perf_event_attach(struct perf_event *event);
+void perf_event_detach(struct perf_event *event);

 struct perf_sample_data {
 	u64				type;
@@ -1023,6 +1050,14 @@ perf_event_task_sched_in(struct task_str
 static inline void
 perf_event_task_sched_out(struct task_struct *task,
 			  struct task_struct *next) { }
+
+static inline void
+perf_event_output(struct perf_event *event, int nmi,
+		struct perf_sample_data *data, struct pt_regs *regs) { }
+
+static inline void perf_event_attach(struct perf_event *event) { }
+static inline void perf_event_detach(struct perf_event *event) { }
+
 static inline void
 perf_event_task_tick(struct task_struct *task) { }
 static inline int perf_event_init_task(struct task_struct *child) { return 0; }
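To make the intent of the layout change concrete, here is a standalone
sketch (illustration only, not part of the patch; attr_mock is a cut-down
stand-in for perf_event_attr): with the anonymous union, an attr copy
across the hypercall boundary can move every flag bit in one 64-bit
assignment through the new flags member, instead of assigning each
bitfield individually.

/*
 * Standalone illustration, NOT kernel code: attr_mock is a cut-down
 * mock of the patched perf_event_attr layout.  Builds with gcc.
 */
#include <stdint.h>
#include <stdio.h>

struct attr_mock {
	union {
		uint64_t flags;			/* whole flag word */
		struct {
			uint64_t disabled     : 1,	/* off by default */
				 inherit      : 1,	/* children inherit it */
				 pinned       : 1,	/* must always be on PMU */
				 exclusive    : 1,	/* only group on PMU */
				 __reserved_1 : 60;
		};
	};
};

int main(void)
{
	struct attr_mock guest = { { 0 } }, host = { { 0 } };

	guest.disabled = 1;
	guest.pinned = 1;

	/* One 64-bit store moves every flag bit across at once. */
	host.flags = guest.flags;

	printf("host: disabled=%u pinned=%u\n",
	       (unsigned)host.disabled, (unsigned)host.pinned);
	return 0;
}

This layout is presumably also why the watchdog.c hunks below move
.pinned and .disabled out of the static initializer: older gcc versions
cannot apply designated initializers to members of an anonymous struct
nested inside a union, so the flags become runtime assignments.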
--- linux-2.6_tip0620/kernel/watchdog.c	2010-06-21 15:20:48.517999849 +0800
+++ linux-2.6_tip0620perfkvm/kernel/watchdog.c	2010-06-21 15:21:39.315999849 +0800
@@ -197,8 +197,6 @@ static struct perf_event_attr wd_hw_attr
 	.type		= PERF_TYPE_HARDWARE,
 	.config		= PERF_COUNT_HW_CPU_CYCLES,
 	.size		= sizeof(struct perf_event_attr),
-	.pinned		= 1,
-	.disabled	= 1,
 };

 /* Callback function for perf event subsystem */
@@ -361,6 +359,8 @@ static int watchdog_nmi_enable(int cpu)
 	/* Try to register using hardware perf events */
 	wd_attr = &wd_hw_attr;
 	wd_attr->sample_period = hw_nmi_get_sample_period();
+	wd_attr->pinned = 1;
+	wd_attr->disabled = 1;
 	event = perf_event_create_kernel_counter(wd_attr, cpu, -1, watchdog_overflow_callback);
 	if (!IS_ERR(event)) {
 		printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n");

--- linux-2.6_tip0620/kernel/perf_event.c	2010-06-21 15:20:49.013999849 +0800
+++ linux-2.6_tip0620perfkvm/kernel/perf_event.c	2010-06-21 16:52:35.432999849 +0800
@@ -32,6 +32,7 @@
 #include <linux/perf_event.h>
 #include <linux/ftrace_event.h>
 #include <linux/hw_breakpoint.h>
+#include <linux/kvm_para.h>

 #include <asm/irq_regs.h>

@@ -747,6 +748,7 @@ static int group_can_go_on(struct perf_e
 	 */
 	if (event->attr.exclusive && cpuctx->active_oncpu)
 		return 0;
+
 	/*
 	 * Otherwise, try to add it if all previous groups were able
 	 * to go on.
@@ -1613,6 +1615,7 @@ void perf_event_task_tick(struct task_st
 	struct perf_cpu_context *cpuctx;
 	struct perf_event_context *ctx;
 	int rotate = 0;
+	int adjust_freq = 1;

 	if (!atomic_read(&nr_events))
 		return;
@@ -1626,9 +1629,22 @@ void perf_event_task_tick(struct task_st
 	if (ctx && ctx->nr_events && ctx->nr_events != ctx->nr_active)
 		rotate = 1;

-	perf_ctx_adjust_freq(&cpuctx->ctx);
-	if (ctx)
-		perf_ctx_adjust_freq(ctx);
+#ifdef CONFIG_KVM_PERF
+	if (kvm_para_available()) {
+		/*
+		 * perf_ctx_adjust_freq causes lots of pmu->read calls,
+		 * which would trigger too many vmexits to the host
+		 * kernel.  Disable it in the paravirt case.
+		 */
+		adjust_freq = 0;
+	}
+#endif
+
+	if (adjust_freq) {
+		perf_ctx_adjust_freq(&cpuctx->ctx);
+		if (ctx)
+			perf_ctx_adjust_freq(ctx);
+	}

 	if (!rotate)
 		return;
@@ -3434,7 +3450,7 @@ void perf_prepare_sample(struct perf_eve
 	}
 }

-static void perf_event_output(struct perf_event *event, int nmi,
+void perf_event_output(struct perf_event *event, int nmi,
 				struct perf_sample_data *data,
 				struct pt_regs *regs)
 {
@@ -5261,6 +5277,47 @@ perf_event_create_kernel_counter(struct
 }
 EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);

+void perf_event_attach(struct perf_event *event)
+{
+	struct perf_event_context *old_ctx, *new_ctx;
+
+	old_ctx = event->ctx;
+	new_ctx = find_get_context(current->pid, -1);
+	if (old_ctx != new_ctx) {
+		if (old_ctx) {
+			/* Delete from the old ctx before joining the new ctx */
+			mutex_lock(&old_ctx->mutex);
+			raw_spin_lock(&old_ctx->lock);
+			list_del_event(event, old_ctx);
+			raw_spin_unlock(&old_ctx->lock);
+			mutex_unlock(&old_ctx->mutex);
+			put_ctx(old_ctx);
+		}
+
+		mutex_lock(&new_ctx->mutex);
+		raw_spin_lock(&new_ctx->lock);
+		list_add_event(event, new_ctx);
+		event->ctx = new_ctx;
+		raw_spin_unlock(&new_ctx->lock);
+		mutex_unlock(&new_ctx->mutex);
+	} else
+		put_ctx(new_ctx);
+
+	perf_event_enable(event);
+}
+EXPORT_SYMBOL_GPL(perf_event_attach);
+
+void perf_event_detach(struct perf_event *event)
+{
+	/*
+	 * Just disable the event; don't delete it from
+	 * ctx->event_list, in case of a race with
+	 * perf_event_read_value.
+	 */
+	perf_event_disable(event);
+}
+EXPORT_SYMBOL_GPL(perf_event_detach);
+
 /*
  * inherit a event from parent task to child task:
  */
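For completeness, a hypothetical caller of the new attach/detach
interface (illustration only, not part of the patch; the example_* names
are invented).  It creates a disabled per-task counter with
perf_event_create_kernel_counter(), moves it onto the current task's
context with perf_event_attach(), and later quiesces it with
perf_event_detach(), matching the declarations added to perf_event.h
above:

/*
 * Hypothetical usage sketch -- not part of the patch.  The example_*
 * names are invented; it assumes the interface declared in the
 * perf_event.h hunks above.
 */
#include <linux/err.h>
#include <linux/perf_event.h>
#include <linux/sched.h>

static struct perf_event *example_event;

static void example_overflow(struct perf_event *event, int nmi,
			     struct perf_sample_data *data,
			     struct pt_regs *regs)
{
	/* e.g. copy overflow data into the guest shadow area */
}

static int example_start(void)
{
	struct perf_event_attr attr = {
		.type	= PERF_TYPE_HARDWARE,
		.config	= PERF_COUNT_HW_CPU_CYCLES,
		.size	= sizeof(struct perf_event_attr),
	};

	/*
	 * Flag bits are set at runtime, mirroring the watchdog.c change:
	 * with the anonymous union they can no longer be statically
	 * initialized on older gcc.
	 */
	attr.disabled = 1;
	attr.sample_period = 1000000;

	example_event = perf_event_create_kernel_counter(&attr, -1,
					current->pid, example_overflow);
	if (IS_ERR(example_event))
		return PTR_ERR(example_event);

	/* Bind the event to the current task's context and enable it */
	perf_event_attach(example_event);
	return 0;
}

static void example_stop(void)
{
	/*
	 * Disable only: the event stays on ctx->event_list, so a
	 * concurrent perf_event_read_value() remains safe.
	 */
	perf_event_detach(example_event);
}

Note that perf_event_detach() deliberately leaves the event on
ctx->event_list, so final teardown would still go through
perf_event_release_kernel().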