Re: [PATCH 2/4] Implement H_SET_MODE for ppc64le

Balbir Singh <bsingharora@xxxxxxxxx> · Wed, 30 Mar 2016 22:08:10 +1100



On 30/03/16 16:39, Michael Ellerman wrote:
> Hi Balbir,
>
> So I got this running and it seems to work well.
>
> I have some comments on the implementation though, see below ...
>
> On Mon, 2016-03-21 at 18:17 +1100, Balbir Singh wrote:
>
>> Basic infrastructure for queuing a task to a specifici CPU and
>> the use of that in setting ILE (Little Endian Interrupt Handling)
>> on power via h_set_mode hypercall
>>
>> Signed-off-by: Balbir Singh <bsingharora@xxxxxxxxx>
>> diff --git a/include/kvm/kvm.h b/include/kvm/kvm.h
>> index 37155db..731abee 100644
>> --- a/include/kvm/kvm.h
>> +++ b/include/kvm/kvm.h
>> @@ -15,6 +15,7 @@
>>  
>>  #define SIGKVMEXIT		(SIGRTMIN + 0)
>>  #define SIGKVMPAUSE		(SIGRTMIN + 1)
>> +#define SIGKVMTASK		(SIGRTMIN + 2)
>>  
>>  #define KVM_PID_FILE_PATH	"/.lkvm/"
>>  #define HOME_DIR		getenv("HOME")
>> diff --git a/kvm-cpu.c b/kvm-cpu.c
>> index ad4441b..438414f 100644
>> --- a/kvm-cpu.c
>> +++ b/kvm-cpu.c
>> @@ -83,10 +83,59 @@ void kvm_cpu__reboot(struct kvm *kvm)
>>  	}
>>  }
>>  
>> +static void kvm_cpu__run_task(int sig, siginfo_t * info, void *context)
>> +{
>> +	union sigval val;
>> +	struct kvm_cpu_task *task_ptr;
>> +
>> +	if (!info) {
>> +		pr_warning("signal queued without info\n");
>> +		return;
>> +	}
>> +
>> +	val = info->si_value;
>> +	task_ptr = val.sival_ptr;
>> +	if (!task_ptr) {
>> +		pr_warning("Task queued without data\n");
>> +		return;
>> +	}
>> +
>> +	if (!task_ptr->task || !task_ptr->data) {
>> +		pr_warning("Failed to get task information\n");
>> +		return;
>> +	}
>> +
>> +	task_ptr->task(task_ptr->data);
>> +	free(task_ptr);
>> +}
> I don't think it's safe to do the actual task call from signal context. Rather
> it should set a flag that the main loop detects and then runs the task there.
I don't think it matters. We do other stuff like kvm__pause() from signal context.
The problem I see with the main loop detection is that we could potentially have
both modes queued up

>> +int kvm_cpu__queue_task(struct kvm_cpu *cpu, void (*task)(void *data),
>> +			void *data)
>> +{
>> +	struct kvm_cpu_task *task_ptr = NULL;
>> +	union sigval val;
>> +
>> +	task_ptr = malloc(sizeof(struct kvm_cpu_task));
>> +	if (!task_ptr)
>> +		return -ENOMEM;
>> +
>> +	task_ptr->task = task;
>> +	task_ptr->data = data;
>> +	val.sival_ptr = task_ptr;
>> +
>> +	pthread_sigqueue(cpu->thread, SIGKVMTASK, val);
>> +	return 0;
>> +}
> I think it would be nicer if this interface dealt with waiting for the
> response. Rather than the caller having to do it.
>
> Possibly in future we'll want to do an async task, but we can refactor the code
> then to skip doing the wait.
I wrote the core code (powerpc independent bits) to be async with an example of
our code of how to do sync.
>> diff --git a/powerpc/include/kvm/kvm-cpu-arch.h b/powerpc/include/kvm/kvm-cpu-arch.h
>> index 01eafdf..033b702 100644
>> --- a/powerpc/include/kvm/kvm-cpu-arch.h
>> +++ b/powerpc/include/kvm/kvm-cpu-arch.h
>> @@ -38,6 +38,8 @@
>>  
>>  #define POWER7_EXT_IRQ	0
>>  
>> +#define LPCR_ILE (1 << (63-38))
>> +
>>  struct kvm;
>>  
>>  struct kvm_cpu {
>> diff --git a/powerpc/spapr.h b/powerpc/spapr.h
>> index 8b294d1..f851f4a 100644
>> --- a/powerpc/spapr.h
>> +++ b/powerpc/spapr.h
>> @@ -27,7 +27,7 @@ typedef uintptr_t target_phys_addr_t;
>>  #define H_HARDWARE	-1	/* Hardware error */
>>  #define H_FUNCTION	-2	/* Function not supported */
>>  #define H_PARAMETER	-4	/* Parameter invalid, out-of-range or conflicting */
>> -
>> +#define H_P2		-55
>>  #define H_SET_DABR		0x28
>>  #define H_LOGICAL_CI_LOAD	0x3c
>>  #define H_LOGICAL_CI_STORE	0x40
>> @@ -41,7 +41,18 @@ typedef uintptr_t target_phys_addr_t;
>>  #define H_EOI			0x64
>>  #define H_IPI			0x6c
>>  #define H_XIRR			0x74
>> -#define MAX_HCALL_OPCODE	H_XIRR
>> +#define H_SET_MODE		0x31C
>> +#define MAX_HCALL_OPCODE	H_SET_MODE
>> +
>> +/* Values for 2nd argument to H_SET_MODE */
>> +#define H_SET_MODE_RESOURCE_SET_CIABR		1
>> +#define H_SET_MODE_RESOURCE_SET_DAWR		2
>> +#define H_SET_MODE_RESOURCE_ADDR_TRANS_MODE	3
>> +#define H_SET_MODE_RESOURCE_LE			4
>> +
>> +/* Flags for H_SET_MODE_RESOURCE_LE */
>> +#define H_SET_MODE_ENDIAN_BIG		0
>> +#define H_SET_MODE_ENDIAN_LITTLE	1
>>  
>>  /*
>>   * The hcalls above are standardized in PAPR and implemented by pHyp
>> diff --git a/powerpc/spapr_hcall.c b/powerpc/spapr_hcall.c
>> index ff1d63a..682fad5 100644
>> --- a/powerpc/spapr_hcall.c
>> +++ b/powerpc/spapr_hcall.c
>> @@ -18,6 +18,9 @@
>>  
>>  #include <stdio.h>
>>  #include <assert.h>
>> +#include <sys/eventfd.h>
>> +
>> +static int task_event;
>>  
>>  static spapr_hcall_fn papr_hypercall_table[(MAX_HCALL_OPCODE / 4) + 1];
>>  static spapr_hcall_fn kvmppc_hypercall_table[KVMPPC_HCALL_MAX -
>> @@ -74,6 +77,113 @@ static target_ulong h_logical_dcbf(struct kvm_cpu *vcpu, target_ulong opcode, ta
>>  	return H_SUCCESS;
>>  }
>>  
>> +struct lpcr_data {
>> +	struct kvm_cpu	*cpu;
>> +	int		mode;
>> +};
>> +
>> +static int get_cpu_lpcr(struct kvm_cpu *vcpu, target_ulong *lpcr)
>> +{
>> +	struct kvm_one_reg reg = {
>> +		.id = KVM_REG_PPC_LPCR_64,
>> +		.addr = (__u64)lpcr
>> +	};
>> +
>> +	return ioctl(vcpu->vcpu_fd, KVM_GET_ONE_REG, &reg);
>> +}
>> +
>> +static int set_cpu_lpcr(struct kvm_cpu *vcpu, target_ulong *lpcr)
> This function has a reasonable name ..
>
>> +{
>> +	struct kvm_one_reg reg = {
>> +		.id = KVM_REG_PPC_LPCR_64,
>> +		.addr = (__u64)lpcr
>> +	};
>> +
>> +	return ioctl(vcpu->vcpu_fd, KVM_SET_ONE_REG, &reg);
>> +}
>> +
>> +static void set_lpcr_cpu(void *data)
> But then this one is *very* similar.
>
> I think this should actually be called set_cpu_ile(), because that's what it
> does. And maybe have "task" in the name because it's the version for using with
> kvm_cpu__queue_task().
I'll change this to set_cpu_ile -- good catch
>
>> +{
>> +	struct lpcr_data *fn_data = (struct lpcr_data *)data;
>> +	int ret;
>> +	target_ulong lpcr;
>> +	u64 task_done = 1;
>> +
>> +	if (!fn_data || !fn_data->cpu)
>> +		return;
> This should be hard errors IMHO.
I wanted to avoid hard errors to see if the OS can fail with an OOPS
later. My concern is that hard errors always leave the system in a very
bad state with no scope to debug. I can change that if required
>> +	ret = get_cpu_lpcr(fn_data->cpu, &lpcr);
>> +	if (ret < 0)
>> +		return;
> Uh oh!
>
> It looks like most code calls die() if KVM_SET_ONE_REG fails, that would be
> preferable I think than running some cpus with a different endian :)

>> +	if (fn_data->mode == H_SET_MODE_ENDIAN_BIG)
>> +		lpcr &= ~LPCR_ILE;
>> +	else
>> +		lpcr |= LPCR_ILE;
>> +
>> +	ret = set_cpu_lpcr(fn_data->cpu, &lpcr);
>> +	if (ret < 0)
>> +		return;
>> +
>> +	free(data);
> I don't think we should be doing the free here.
>From the point the callback gets the data, it owns it. Otherwise we'd have to
implement an I am done with this processing

>> +	if (write(task_event, &task_done, sizeof(task_done)) < 0)
>> +		pr_warning("Failed to notify of lpcr task done\n");
>> +}
>> +
>> +#define for_each_vcpu(cpu, kvm, i) \
>> +	for ((i) = 0, (cpu) = (kvm)->cpus[i]; (i) < (kvm)->nrcpus; (i)++, (cpu) = (kvm)->cpus[i])
> That should probably be in a header.
Yep, I did not see any use outside this function, but I can move it
>> +static target_ulong h_set_mode(struct kvm_cpu *vcpu, target_ulong opcode, target_ulong *args)
>> +{
>> +	int ret = H_SUCCESS;
> That init should be removed.
OK, I'll revisit the error handling
>> +	struct kvm *kvm = vcpu->kvm;
>> +	struct kvm_cpu *cpu;
>> +	int i;
>> +
>> +	switch (args[1]) {
>> +	case H_SET_MODE_RESOURCE_LE: {
>> +		u64 total_done = 0;
>> +		u64 task_read;
>> +
>> +		task_event = eventfd(0, 0);
>> +		if (task_event < 0) {
>> +			pr_warning("Failed to create task_event");
>> +			break;
> That will return H_SUCCESS which is not OK.
Good catch!
>> +		}
>> +		for_each_vcpu(cpu, kvm, i) {
>> +			struct lpcr_data *data;
>> +
>> +			data = malloc(sizeof(struct lpcr_data));
> Is there any reason not to do this synchronously?
>
> That would allow you to put data on the stack. And also avoid the while loop
> below.
Synchronous implies a two way communication mechanism between this vCPU
and all others to communicate begin and end of a task
>
>> +			if (!data) {
>> +				ret = H_P2;
>> +				break;
>> +			}
>> +			data->cpu = cpu;
>> +			data->mode = args[0];
>> +
>> +			kvm_cpu__queue_task(cpu, set_lpcr_cpu, data);
>> +		}
>> +
>> +		while ((int)total_done < kvm->nrcpus) {
>> +			int err;
>> +			err = read(task_event, &task_read, sizeof(task_read));
>> +			if (err < 0) {
>> +				ret = H_P2;
>> +				break;
>> +			}
>> +			total_done += task_read;
>> +		}
>> +		close(task_event);
>> +		break;
>> +	}
>> +	default:
>> +		ret = H_FUNCTION;
>> +		break;
>> +	}
>> +	return (ret < 0) ? H_P2 : H_SUCCESS;
>> +}
> I think that ends up being correct, but it's pretty obscure. ie. for an
> unsupported resource we should return H_P2, and you get that to happen by
> setting ret to H_FUNCTION (-2).
Good catch! I'll fix this
>
> cheers
>

Thanks for the review
Balbir
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html