On 2021-08-20 at 1:32 a.m., Joseph Greathouse wrote:
> Give every process at most one queue from each SDMA engine.
> Previously, we allocated all SDMA engines and queues on a
> first-come, first-served basis. This meant that it was possible for
> two processes racing on their allocation requests to each end up
> with two queues on the same SDMA engine. That could serialize the
> execution of commands submitted to different queues.
>
> This new mechanism allows each process at most a single queue on
> each SDMA engine. Processes will check for queue availability on
> SDMA engine 0, then 1, then 2, etc., and only allocate on an engine
> if it has an available queue slot. If there are no queue slots
> available (or if this process has already allocated a queue on this
> engine), it moves on and tries the next engine.
>
> The Aldebaran chip has a small quirk: SDMA0 should only be used by
> the very first allocation from each process. It is OK to use any
> other SDMA engine at any time, but after the first SDMA allocation
> request from a process, the resulting engine must be SDMA1 or
> above. This patch handles this case as well.
>
> Signed-off-by: Joseph Greathouse <Joseph.Greathouse@xxxxxxx>

One final nit-pick inline. With that fixed, the series is

Reviewed-by: Felix Kuehling <Felix.Kuehling@xxxxxxx>
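As background for anyone skimming the patch below: each queue type is
tracked in a flat 64-bit bitmap, and a slot index encodes both the
engine and the per-engine queue. A minimal standalone sketch of that
numbering convention (my own illustration; the engine count is made up,
nothing here is taken from the driver):

#include <stdio.h>

/* Illustration only: how a flat sdma_id splits into (engine, queue).
 * The patch computes exactly this with "% num_engines" and
 * "/ num_engines".
 */
int main(void)
{
        unsigned int num_engines = 4;   /* hypothetical engine count */
        unsigned int sdma_id;

        for (sdma_id = 0; sdma_id < 8; sdma_id++)
                printf("sdma_id %u -> engine %u, queue %u\n",
                       sdma_id,
                       sdma_id % num_engines,  /* engine the slot lives on */
                       sdma_id / num_engines); /* queue index on that engine */
        return 0;
}

With num_engines = 4 this prints engines 0,1,2,3,0,1,2,3 for queues
0,0,0,0,1,1,1,1, i.e. consecutive slot numbers interleave across the
engines, which is why the bitmap masking below works per engine.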
> ---
>  .../drm/amd/amdkfd/kfd_device_queue_manager.c | 135 +++++++++++++-----
>  drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |   2 +
>  drivers/gpu/drm/amd/amdkfd/kfd_process.c      |   3 +
>  3 files changed, 107 insertions(+), 33 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index f8fce9d05f50..86bdb765f350 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -52,12 +52,14 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
>  static int map_queues_cpsch(struct device_queue_manager *dqm);
>
>  static void deallocate_sdma_queue(struct device_queue_manager *dqm,
> +                               struct qcm_process_device *qpd,
>                                 struct queue *q);
>
>  static inline void deallocate_hqd(struct device_queue_manager *dqm,
>                                 struct queue *q);
>  static int allocate_hqd(struct device_queue_manager *dqm, struct queue *q);
>  static int allocate_sdma_queue(struct device_queue_manager *dqm,
> +                               struct qcm_process_device *qpd,
>                                 struct queue *q);
>  static void kfd_process_hw_exception(struct work_struct *work);
>
> @@ -349,7 +351,7 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
>                         q->pipe, q->queue);
>         } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
>                 q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
> -               retval = allocate_sdma_queue(dqm, q);
> +               retval = allocate_sdma_queue(dqm, qpd, q);
>                 if (retval)
>                         goto deallocate_vmid;
>                 dqm->asic_ops.init_sdma_vm(dqm, q, qpd);
> @@ -410,7 +412,7 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
>                 deallocate_hqd(dqm, q);
>         else if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
>                 q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
> -               deallocate_sdma_queue(dqm, q);
> +               deallocate_sdma_queue(dqm, qpd, q);
>  deallocate_vmid:
>         if (list_empty(&qpd->queues_list))
>                 deallocate_vmid(dqm, qpd, q);
> @@ -475,9 +477,9 @@ static int destroy_queue_nocpsch_locked(struct device_queue_manager *dqm,
>         if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE)
>                 deallocate_hqd(dqm, q);
>         else if (q->properties.type == KFD_QUEUE_TYPE_SDMA)
> -               deallocate_sdma_queue(dqm, q);
> +               deallocate_sdma_queue(dqm, qpd, q);
>         else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
> -               deallocate_sdma_queue(dqm, q);
> +               deallocate_sdma_queue(dqm, qpd, q);
>         else {
>                 pr_debug("q->properties.type %d is invalid\n",
>                                 q->properties.type);
> @@ -1039,42 +1041,93 @@ static void pre_reset(struct device_queue_manager *dqm)
>         dqm_unlock(dqm);
>  }
>
> +static uint64_t sdma_engine_mask(int engine, int num_engines)
> +{
> +       uint64_t mask = 0;
> +
> +       engine %= num_engines;
> +       while (engine < (sizeof(uint64_t)*8)) {
> +               mask |= 1ULL << engine;
> +               engine += num_engines;
> +       }
> +       return mask;
> +}
> +
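A quick way to convince yourself what sdma_engine_mask() above returns:
it sets one bit per queue slot owned by the chosen engine, striding
through the whole 64-bit bitmap. Here is the helper lifted into a tiny
userspace harness (my own test, not part of the patch; the int cast
just silences the signed/unsigned comparison outside the kernel):

#include <stdint.h>
#include <stdio.h>

/* Local copy of the patch's helper so it can be run standalone. */
static uint64_t sdma_engine_mask(int engine, int num_engines)
{
        uint64_t mask = 0;

        engine %= num_engines;
        while (engine < (int)(sizeof(uint64_t) * 8)) {
                mask |= 1ULL << engine;
                engine += num_engines;
        }
        return mask;
}

int main(void)
{
        /* With 4 engines, engine 1 owns bits 1, 5, 9, ..., 61. */
        printf("0x%016llx\n",
               (unsigned long long)sdma_engine_mask(1, 4));
        /* prints 0x2222222222222222 */
        return 0;
}

ANDing that mask with dqm->sdma_bitmap leaves exactly the free queue
slots that live on the one engine being considered, which is what the
allocation loop below relies on.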
>  static int allocate_sdma_queue(struct device_queue_manager *dqm,
> +                               struct qcm_process_device *qpd,
>                                 struct queue *q)
>  {
> -       int bit;
> +       uint64_t available_queue_bitmap;
> +       unsigned int bit, engine, num_engines;
> +       bool found_sdma = false;
>
>         if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
> -               if (dqm->sdma_bitmap == 0) {
> +               num_engines = get_num_sdma_engines(dqm);
> +               for_each_set_bit(engine, &(qpd->sdma_engine_bitmap), num_engines) {
> +                       /* Do not reuse SDMA0 for any subsequent SDMA queue
> +                        * requests on Aldebaran. If SDMA0's queues are all
> +                        * full, then this process should never use SDMA0
> +                        * for any further requests
> +                        */
> +                       if (dqm->dev->device_info->asic_family == CHIP_ALDEBARAN &&
> +                           engine == 0)
> +                               qpd->sdma_engine_bitmap &= ~(1ULL << engine);
> +
> +                       available_queue_bitmap = sdma_engine_mask(engine, num_engines);
> +                       available_queue_bitmap &= dqm->sdma_bitmap;
> +
> +                       if (!available_queue_bitmap)
> +                               continue;
> +                       /* Take the selected engine off the list so we will not
> +                        * allocate two queues onto the same engine
> +                        */
> +                       qpd->sdma_engine_bitmap &= ~(1ULL << engine);
> +                       found_sdma = true;
> +
> +                       bit = __ffs64(available_queue_bitmap);
> +                       dqm->sdma_bitmap &= ~(1ULL << bit);
> +                       q->sdma_id = bit;
> +                       q->properties.sdma_engine_id = q->sdma_id % num_engines;
> +                       q->properties.sdma_queue_id = q->sdma_id / num_engines;
> +                       break;
> +               }
> +               if (!found_sdma) {
>                         pr_err("No more SDMA queue to allocate\n");
>                         return -ENOMEM;
>                 }
> -
> -               bit = __ffs64(dqm->sdma_bitmap);
> -               dqm->sdma_bitmap &= ~(1ULL << bit);
> -               q->sdma_id = bit;
> -               q->properties.sdma_engine_id = q->sdma_id %
> -                               get_num_sdma_engines(dqm);
> -               q->properties.sdma_queue_id = q->sdma_id /
> -                               get_num_sdma_engines(dqm);
>         } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
> -               if (dqm->xgmi_sdma_bitmap == 0) {
> +               num_engines = get_num_xgmi_sdma_engines(dqm);
> +               for_each_set_bit(engine, &(qpd->xgmi_sdma_engine_bitmap), num_engines) {
> +                       available_queue_bitmap = sdma_engine_mask(engine, num_engines);
> +                       available_queue_bitmap &= dqm->xgmi_sdma_bitmap;
> +
> +                       if (!available_queue_bitmap)
> +                               continue;
> +                       /* Take the selected engine off the list so we will not
> +                        * allocate two queues onto the same engine
> +                        */
> +                       qpd->xgmi_sdma_engine_bitmap &= ~(1ULL << engine);
> +                       found_sdma = true;
> +
> +                       bit = __ffs64(available_queue_bitmap);
> +                       dqm->xgmi_sdma_bitmap &= ~(1ULL << bit);
> +                       q->sdma_id = bit;
> +                       /* sdma_engine_id is sdma id including
> +                        * both PCIe-optimized SDMAs and XGMI-
> +                        * optimized SDMAs. The calculation below
> +                        * assumes the first N engines are always
> +                        * PCIe-optimized ones
> +                        */
> +                       q->properties.sdma_engine_id = get_num_sdma_engines(dqm) +
> +                               q->sdma_id % get_num_xgmi_sdma_engines(dqm);
> +                       q->properties.sdma_queue_id = q->sdma_id /
> +                               get_num_xgmi_sdma_engines(dqm);

You could use "num_engines" here instead of get_num_xgmi_sdma_engines(dqm).

Regards,
  Felix

> +                       break;
> +               }
> +               if (!found_sdma) {
>                         pr_err("No more XGMI SDMA queue to allocate\n");
>                         return -ENOMEM;
>                 }
> -               bit = __ffs64(dqm->xgmi_sdma_bitmap);
> -               dqm->xgmi_sdma_bitmap &= ~(1ULL << bit);
> -               q->sdma_id = bit;
> -               /* sdma_engine_id is sdma id including
> -                * both PCIe-optimized SDMAs and XGMI-
> -                * optimized SDMAs. The calculation below
> -                * assumes the first N engines are always
> -                * PCIe-optimized ones
> -                */
> -               q->properties.sdma_engine_id = get_num_sdma_engines(dqm) +
> -                       q->sdma_id % get_num_xgmi_sdma_engines(dqm);
> -               q->properties.sdma_queue_id = q->sdma_id /
> -                       get_num_xgmi_sdma_engines(dqm);
>         }
>
>         pr_debug("SDMA engine id: %d\n", q->properties.sdma_engine_id);
> @@ -1084,16 +1137,32 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm,
>  }
>
>  static void deallocate_sdma_queue(struct device_queue_manager *dqm,
> +                               struct qcm_process_device *qpd,
>                                 struct queue *q)
>  {
> +       uint32_t engine = q->properties.sdma_engine_id;
> +
>         if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
>                 if (q->sdma_id >= get_num_sdma_queues(dqm))
>                         return;
>                 dqm->sdma_bitmap |= (1ULL << q->sdma_id);
> +               /* Don't give SDMA0 back to be reallocated on Aldebaran.
> +                * It is only OK to use this engine from the first allocation
> +                * within a process.
> +                */
> +               if (!(dqm->dev->device_info->asic_family == CHIP_ALDEBARAN &&
> +                     engine == 0))
> +                       qpd->sdma_engine_bitmap |= (1ULL << engine);
> +
>         } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
>                 if (q->sdma_id >= get_num_xgmi_sdma_queues(dqm))
>                         return;
>                 dqm->xgmi_sdma_bitmap |= (1ULL << q->sdma_id);
> +               /* engine ID in the queue properties is the global engine ID.
> +                * The XGMI engine bitmap ignores the PCIe-optimized engines.
> +                */
> +               engine -= get_num_sdma_engines(dqm);
> +               qpd->xgmi_sdma_engine_bitmap |= (1ULL << engine);
>         }
>  }
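To make the Aldebaran special case concrete, here is a hand-traced
example of one process's qpd->sdma_engine_bitmap across the allocate
and deallocate paths above (the engine count of 5 is hypothetical, and
the allocation order assumes engines are tried lowest bit first, as
for_each_set_bit() does):

        init (process create):    0b11111  all five engines offered
        1st alloc lands on SDMA0: 0b11110  engine 0 retired immediately
        2nd alloc lands on SDMA1: 0b11100
        free the SDMA1 queue:     0b11110  engine 1 handed back
        free the SDMA0 queue:     0b11110  engine 0 deliberately NOT handed back

So after its very first allocation the process can never be placed on
SDMA0 again, which is exactly the quirk the commit message describes.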
> @@ -1303,7 +1372,7 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
>         if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
>                 q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
>                 dqm_lock(dqm);
> -               retval = allocate_sdma_queue(dqm, q);
> +               retval = allocate_sdma_queue(dqm, qpd, q);
>                 dqm_unlock(dqm);
>                 if (retval)
>                         goto out;
> @@ -1365,7 +1434,7 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
>         if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
>                 q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
>                 dqm_lock(dqm);
> -               deallocate_sdma_queue(dqm, q);
> +               deallocate_sdma_queue(dqm, qpd, q);
>                 dqm_unlock(dqm);
>         }
>  out:
> @@ -1536,7 +1605,7 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
>
>         if ((q->properties.type == KFD_QUEUE_TYPE_SDMA) ||
>             (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)) {
> -               deallocate_sdma_queue(dqm, q);
> +               deallocate_sdma_queue(dqm, qpd, q);
>                 pdd->sdma_past_activity_counter += sdma_val;
>         }
>
> @@ -1751,9 +1820,9 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
>         /* Clear all user mode queues */
>         list_for_each_entry(q, &qpd->queues_list, list) {
>                 if (q->properties.type == KFD_QUEUE_TYPE_SDMA)
> -                       deallocate_sdma_queue(dqm, q);
> +                       deallocate_sdma_queue(dqm, qpd, q);
>                 else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
> -                       deallocate_sdma_queue(dqm, q);
> +                       deallocate_sdma_queue(dqm, qpd, q);
>
>                 if (q->properties.is_active) {
>                         decrement_queue_count(dqm, q->properties.type);
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index ab83b0de6b22..c38eebc9db4d 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -576,6 +576,8 @@ struct qcm_process_device {
>         struct list_head priv_queue_list;
>
>         unsigned int queue_count;
> +       unsigned long sdma_engine_bitmap;
> +       unsigned long xgmi_sdma_engine_bitmap;
>         unsigned int vmid;
>         bool is_debug;
>         unsigned int evicted; /* eviction counter, 0=active */
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> index 21ec8a18cad2..13c85624bf7d 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> @@ -1422,6 +1422,7 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev,
>                                                         struct kfd_process *p)
>  {
>         struct kfd_process_device *pdd = NULL;
> +       const struct kfd_device_info *dev_info = dev->dqm->dev->device_info;
>
>         if (WARN_ON_ONCE(p->n_pdds >= MAX_GPU_INSTANCE))
>                 return NULL;
> @@ -1446,6 +1447,8 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev,
>         pdd->qpd.pqm = &p->pqm;
>         pdd->qpd.evicted = 0;
>         pdd->qpd.mapped_gws_queue = false;
> +       pdd->qpd.sdma_engine_bitmap = BIT_ULL(dev_info->num_sdma_engines) - 1;
> +       pdd->qpd.xgmi_sdma_engine_bitmap = BIT_ULL(dev_info->num_xgmi_sdma_engines) - 1;
>         pdd->process = p;
>         pdd->bound = PDD_UNBOUND;
>         pdd->already_dequeued = false;
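One last worked detail on the kfd_process.c hunk: BIT_ULL(n) - 1 yields
a mask with the low n bits set, so every engine starts out available to
a new process. A standalone sanity check (the macro is spelled out
locally to match the kernel's shape, and the engine counts are made
up):

#include <stdio.h>

#define BIT_ULL(nr) (1ULL << (nr))      /* same shape as the kernel macro */

int main(void)
{
        unsigned int num_sdma_engines = 5;      /* hypothetical */
        unsigned int num_xgmi_sdma_engines = 3; /* hypothetical */

        /* 5 engines -> 0x1f (0b11111); 3 engines -> 0x7 (0b111). */
        printf("sdma_engine_bitmap      = 0x%llx\n",
               (unsigned long long)(BIT_ULL(num_sdma_engines) - 1));
        printf("xgmi_sdma_engine_bitmap = 0x%llx\n",
               (unsigned long long)(BIT_ULL(num_xgmi_sdma_engines) - 1));
        return 0;
}

Note that a device with zero XGMI engines gets an empty bitmap
(BIT_ULL(0) - 1 == 0), so the XGMI allocation loop simply never runs.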