Re: [PATCH V1 2/3] perf/x86/intel/bm.c: Add Intel Branch Monitoring support

Peter Zijlstra <peterz@xxxxxxxxxxxxx> · Mon, 13 Nov 2017 10:00:16 +0100

On Sat, Nov 11, 2017 at 01:20:05PM -0800, Megha Dey wrote:
> Currently, the cannonlake family of Intel processors support the
> branch monitoring feature. Intel's Branch monitoring feature is trying
> to utilize heuristics to detect the occurrence of an ROP (Return
> Oriented Programming) attack.
> 
> A perf-based kernel driver has been used to monitor the occurrence of
> one of the 6 branch monitoring events. There are 2 counters that each
> can select between one of these events for evaluation over a specified
> instruction window size (0 to 1023). For each counter, a threshold value
> (0 to 127) can be configured to set a point at which ROP detection event
> action is taken (determined by user-space). Each task can monitor
> a maximum of 2 events at any given time.
> 
> Apart from window_size(global) and threshold(per-counter), various sysfs
> entries are provided for the user to configure: guest_disable, lbr_freeze,
> window_cnt_sel, cnt_and_mode (all global) and mispred_evt_cnt(per-counter).
> For all events belonging to the same task, the global parameters are
> shared.

Is there any sensible documentation on this except the MSR listings?

> 
> Everytime a task is scheduled out, we save current window and count
> associated with the event being monitored. When the task is scheduled
> next, we start counting from previous count associated with this event.
> Thus, a full context switch in this case is not necessary.

What? That doesn't make any sense. The fact that we scheduled out and
then in again _is_ a full context switch, no?

> 
> Signed-off-by: Megha Dey <megha.dey@xxxxxxxxxxxxxxx>
> Signed-off-by: Yu-Cheng Yu <yu-cheng.yu@xxxxxxxxx>

That SoB chain is buggered.

> +static int intel_bm_event_nmi_handler(unsigned int cmd, struct pt_regs *regs)
> +{
> +	struct perf_event *event;
> +	union bm_detect_status stat;
> +	int i;
> +	unsigned long x;
> +
> +	rdmsrl(BR_DETECT_STATUS_MSR, stat.raw);

	if (!stat.event)
		return NMI_DONE;

saves you a whole bunch of indentation, no?

> +
> +	if (stat.event) {
> +		wrmsrl(BR_DETECT_STATUS_MSR, 0);
> +		apic_write(APIC_LVTPC, APIC_DM_NMI);
> +		/*
> +		 * Issue wake-up to corresponding polling event
> +		 */
> +		x = stat.ctrl_hit;
> +		for_each_set_bit(i, &x, BM_MAX_COUNTERS) {
> +			event = current->thread.bm_counter_owner[i];
> +			local64_inc(&event->count);
> +			atomic_set(&event->hw.bm_poll, POLLIN);
> +			event->pending_wakeup = 1;
> +			irq_work_queue(&event->pending);
> +		}
> +		return NMI_HANDLED;
> +	}
> +	return NMI_DONE;
> +}
> +
> +/*
> + * Unmask the NMI bit of the local APIC the first time task is scheduled
> + * on a particular CPU.
> + */
> +static void intel_bm_unmask_nmi(void)
> +{
> +	this_cpu_write(bm_unmask_apic, 0);
> +
> +	if (!(this_cpu_read(bm_unmask_apic))) {
> +		apic_write(APIC_LVTPC, APIC_DM_NMI);
> +		this_cpu_inc(bm_unmask_apic);
> +	}
> +}

What? Why?

> +static int intel_bm_event_add(struct perf_event *event, int mode)
> +{
> +	union bm_detect_status cur_stat, prev_stat;
> +
> +	WARN_ON(event->hw.id >= BM_MAX_COUNTERS);
> +
> +	prev_stat.raw = local64_read(&event->hw.prev_count);
> +
> +	/*
> +	 * Start counting from previous count associated with this event
> +	 */
> +	rdmsrl(BR_DETECT_STATUS_MSR, cur_stat.raw);
> +
> +	cur_stat.count[event->hw.id] = prev_stat.count[event->hw.id];
> +	cur_stat.count_window = prev_stat.count_window;
> +	wrmsrl(BR_DETECT_STATUS_MSR, cur_stat.raw);

Why are you writing back the value you read? Just to waste cycles?

> +	wrmsrl(BR_DETECT_CONTROL_MSR, event->hw.bm_ctrl);
> +
> +	intel_bm_unmask_nmi();
> +
> +	wrmsrl(BR_DETECT_COUNTER_CONFIG_BASE + event->hw.id,
> +		(event->hw.bm_counter_conf | 1));

Please use a named construct for that enable bit.

> +
> +	return 0;
> +}
> +
> +static void intel_bm_event_update(struct perf_event *event)
> +{
> +	union bm_detect_status cur_stat;
> +
> +	rdmsrl(BR_DETECT_STATUS_MSR, cur_stat.raw);
> +	local64_set(&event->hw.prev_count, (uint64_t)cur_stat.raw);
> +}

That looks wrong... the general point of update functions is to update
the count, the above does not in fact do that.

> +
> +static void intel_bm_event_del(struct perf_event *event, int flags)
> +{
> +	WARN_ON(event->hw.id >= BM_MAX_COUNTERS);
> +
> +	wrmsrl(BR_DETECT_COUNTER_CONFIG_BASE + event->hw.id,
> +		(event->hw.bm_counter_conf & ~1));

Either that EN bit is part of the bm_counter_conf, in which case you
didn't need to add it in _add(), or it's not and you don't need to clear
it here. Make up your mind.

> +
> +	intel_bm_event_update(event);

Except of course, that does not in fact update...

> +}
> +
> +static void intel_bm_event_destroy(struct perf_event *event)
> +{
> +	bm_counter_owner[event->hw.id] = NULL;
> +}
> +
> +static DEFINE_MUTEX(bm_counter_mutex);
> +
> +static int intel_bm_event_init(struct perf_event *event)
> +{
> +	u64 cfg;
> +	int counter_to_use = -1, i;
> +
> +	local64_set(&event->hw.prev_count, 0);
> +

> +	/*
> +	 * Find a hardware counter for the target task
> +	 */
> +	bm_counter_owner = event->hw.target->thread.bm_counter_owner;
> +
> +	mutex_lock(&bm_counter_mutex);
> +	for (i = 0; i < BM_MAX_COUNTERS; i++) {
> +		if (bm_counter_owner[i] == NULL) {
> +			counter_to_use = i;
> +			bm_counter_owner[i] = event;
> +			break;
> +		}
> +	}
> +	mutex_unlock(&bm_counter_mutex);
> +
> +	if (counter_to_use == -1)
> +		return -EBUSY;
> +
> +	event->hw.bm_ctrl = (bm_window_size << BM_WINDOW_SIZE_SHIFT) |
> +			    (bm_guest_disable << BM_GUEST_DISABLE_SHIFT) |
> +			    (bm_lbr_freeze << BM_LBR_FREEZE_SHIFT) |
> +			    (bm_window_cnt_sel << BM_WINDOW_CNT_SEL_SHIFT) |
> +			    (bm_cnt_and_mode << BM_CNT_AND_MODE_SHIFT) |
> +								BM_ENABLE;
> +	event->hw.bm_counter_conf = (bm_threshold << BM_THRESHOLD_SHIFT) |
> +			(bm_mispred_evt_cnt << BM_MISPRED_EVT_CNT_SHIFT) |
> +					(cfg << BM_EVENT_TYPE_SHIFT);
> +
> +	event->hw.id = counter_to_use;
> +	local64_set(&event->count, 0);

That is just a really ugly hack to work around:

> +static struct pmu intel_bm_pmu = {
> +	.task_ctx_nr     = perf_sw_context,

this. And you didn't bother to mention that atrocity in your Changelog.

NAK. 
--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html