[PATCH 03/14] drm/amdkfd: map multiple processes to HW scheduler

oded.gabbay@xxxxxxxxx (Oded Gabbay) · Tue, 5 Dec 2017 10:04:55 +0200



On Tue, Nov 28, 2017 at 1:29 AM, Felix Kuehling <Felix.Kuehling at amd.com> wrote:
> Allow HWS to execute multiple processes on the hardware
> concurrently. The number of concurrent processes is limited by
> the number of VMIDs allocated to the HWS.
>
> A module parameter can be used to limit this further or to turn
> it off altogether (mainly for debugging purposes).
>
> Signed-off-by: Yong Zhao <yong.zhao at amd.com>
> Signed-off-by: Jay Cornwall <Jay.Cornwall at amd.com>
> Signed-off-by: Felix Kuehling <Felix.Kuehling at amd.com>
> ---
>  drivers/gpu/drm/amd/amdkfd/kfd_device.c         | 11 +++++++++
>  drivers/gpu/drm/amd/amdkfd/kfd_module.c         |  5 +++++
>  drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c | 30 +++++++++++++++++++++++--
>  drivers/gpu/drm/amd/amdkfd/kfd_priv.h           |  9 ++++++++
>  4 files changed, 53 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> index 4f05eac..a8fa33a 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> @@ -238,6 +238,17 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
>         kfd->vm_info.vmid_num_kfd = kfd->vm_info.last_vmid_kfd
>                         - kfd->vm_info.first_vmid_kfd + 1;
>
> +       /* Verify module parameters regarding mapped process number*/
> +       if ((hws_max_conc_proc < 0)
> +                       || (hws_max_conc_proc > kfd->vm_info.vmid_num_kfd)) {
> +               dev_err(kfd_device,
> +                       "hws_max_conc_proc %d must be between 0 and %d, use %d instead\n",
> +                       hws_max_conc_proc, kfd->vm_info.vmid_num_kfd,
> +                       kfd->vm_info.vmid_num_kfd);
> +               kfd->max_proc_per_quantum = kfd->vm_info.vmid_num_kfd;
> +       } else
> +               kfd->max_proc_per_quantum = hws_max_conc_proc;
> +
>         /* calculate max size of mqds needed for queues */
>         size = max_num_of_queues_per_device *
>                         kfd->device_info->mqd_size_aligned;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_module.c b/drivers/gpu/drm/amd/amdkfd/kfd_module.c
> index ee8adf6..4e060c8 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_module.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_module.c
> @@ -50,6 +50,11 @@ module_param(sched_policy, int, 0444);
>  MODULE_PARM_DESC(sched_policy,
>         "Scheduling policy (0 = HWS (Default), 1 = HWS without over-subscription, 2 = Non-HWS (Used for debugging only)");
>
> +int hws_max_conc_proc = 8;
> +module_param(hws_max_conc_proc, int, 0444);
> +MODULE_PARM_DESC(hws_max_conc_proc,
> +       "Max # processes HWS can execute concurrently when sched_policy=0 (0 = no concurrency, #VMIDs for KFD = Maximum(default))");
> +
>  int cwsr_enable = 1;
>  module_param(cwsr_enable, int, 0444);
>  MODULE_PARM_DESC(cwsr_enable, "CWSR enable (0 = Off, 1 = On (Default))");
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
> index 69c147a..0b7092e 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
> @@ -57,13 +57,24 @@ static void pm_calc_rlib_size(struct packet_manager *pm,
>  {
>         unsigned int process_count, queue_count;
>         unsigned int map_queue_size;
> +       unsigned int max_proc_per_quantum = 1;
> +       struct kfd_dev *dev = pm->dqm->dev;
>
>         process_count = pm->dqm->processes_count;
>         queue_count = pm->dqm->queue_count;
>
> -       /* check if there is over subscription*/
> +       /* check if there is over subscription
> +        * Note: the arbitration between the number of VMIDs and
> +        * hws_max_conc_proc has been done in
> +        * kgd2kfd_device_init().
> +        */
>         *over_subscription = false;
> -       if ((process_count > 1) || queue_count > get_queues_num(pm->dqm)) {
> +
> +       if (dev->max_proc_per_quantum > 1)
> +               max_proc_per_quantum = dev->max_proc_per_quantum;
> +
> +       if ((process_count > max_proc_per_quantum) ||
> +           queue_count > get_queues_num(pm->dqm)) {
>                 *over_subscription = true;
>                 pr_debug("Over subscribed runlist\n");
>         }
> @@ -116,10 +127,24 @@ static int pm_create_runlist(struct packet_manager *pm, uint32_t *buffer,
>                         uint64_t ib, size_t ib_size_in_dwords, bool chain)
>  {
>         struct pm4_mes_runlist *packet;
> +       int concurrent_proc_cnt = 0;
> +       struct kfd_dev *kfd = pm->dqm->dev;
>
>         if (WARN_ON(!ib))
>                 return -EFAULT;
>
> +       /* Determine the number of processes to map together to HW:
> +        * it can not exceed the number of VMIDs available to the
> +        * scheduler, and it is determined by the smaller of the number
> +        * of processes in the runlist and kfd module parameter
> +        * hws_max_conc_proc.
> +        * Note: the arbitration between the number of VMIDs and
> +        * hws_max_conc_proc has been done in
> +        * kgd2kfd_device_init().
> +        */
> +       concurrent_proc_cnt = min(pm->dqm->processes_count,
> +                       kfd->max_proc_per_quantum);
> +
>         packet = (struct pm4_mes_runlist *)buffer;
>
>         memset(buffer, 0, sizeof(struct pm4_mes_runlist));
> @@ -130,6 +155,7 @@ static int pm_create_runlist(struct packet_manager *pm, uint32_t *buffer,
>         packet->bitfields4.chain = chain ? 1 : 0;
>         packet->bitfields4.offload_polling = 0;
>         packet->bitfields4.valid = 1;
> +       packet->bitfields4.process_cnt = concurrent_proc_cnt;
>         packet->ordinal2 = lower_32_bits(ib);
>         packet->bitfields3.ib_base_hi = upper_32_bits(ib);
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index a668764..1edab21 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -88,6 +88,12 @@ extern int max_num_of_queues_per_device;
>  /* Kernel module parameter to specify the scheduling policy */
>  extern int sched_policy;
>
> +/*
> + * Kernel module parameter to specify the maximum process
> + * number per HW scheduler
> + */
> +extern int hws_max_conc_proc;
> +
>  extern int cwsr_enable;
>
>  /*
> @@ -214,6 +220,9 @@ struct kfd_dev {
>         /* Debug manager */
>         struct kfd_dbgmgr           *dbgmgr;
>
> +       /* Maximum process number mapped to HW scheduler */
> +       unsigned int max_proc_per_quantum;
> +
>         /* CWSR */
>         bool cwsr_enabled;
>         const void *cwsr_isa;
> --
> 2.7.4
>

This patch is:
Acked-by: Oded Gabbay <oded.gabbay at gmail.com>