Am 2021-08-19 um 9:37 a.m. schrieb David Yat Sin: > When doing a restore on a different node, the gpu_id's on the restore > node may be different. But the user space application will still refer > to the original gpu_id's in the ioctl calls. Adding code to create a > gpu id mapping so that kfd can determine actual gpu_id during the user > ioctl's. > > Signed-off-by: David Yat Sin <david.yatsin@xxxxxxx> > Signed-off-by: Rajneesh Bhardwaj <rajneesh.bhardwaj@xxxxxxx> > --- > drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 400 +++++++++++++++++------ > drivers/gpu/drm/amd/amdkfd/kfd_events.c | 5 +- > drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 10 + > drivers/gpu/drm/amd/amdkfd/kfd_process.c | 18 + > 4 files changed, 324 insertions(+), 109 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c > index c8f523d8ab81..90e4d4ce4398 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c > @@ -294,13 +294,14 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p, > return err; > > pr_debug("Looking for gpu id 0x%x\n", args->gpu_id); > - dev = kfd_device_by_id(args->gpu_id); > - if (!dev) { > + > + mutex_lock(&p->mutex); > + pdd = kfd_process_device_data_by_id(p, args->gpu_id); > + if (!pdd) { > pr_debug("Could not find gpu id 0x%x\n", args->gpu_id); You need to unlock p->mutex here (i.e. jump to an appropriate error handling label). 
Regards, Felix > return -EINVAL; > } > - > - mutex_lock(&p->mutex); > + dev = pdd->dev; > > pdd = kfd_bind_process_to_device(dev, p); > if (IS_ERR(pdd)) { > @@ -491,7 +492,6 @@ static int kfd_ioctl_set_memory_policy(struct file *filep, > struct kfd_process *p, void *data) > { > struct kfd_ioctl_set_memory_policy_args *args = data; > - struct kfd_dev *dev; > int err = 0; > struct kfd_process_device *pdd; > enum cache_policy default_policy, alternate_policy; > @@ -506,13 +506,15 @@ static int kfd_ioctl_set_memory_policy(struct file *filep, > return -EINVAL; > } > > - dev = kfd_device_by_id(args->gpu_id); > - if (!dev) > - return -EINVAL; > - > mutex_lock(&p->mutex); > + pdd = kfd_process_device_data_by_id(p, args->gpu_id); > + if (!pdd) { > + pr_debug("Could not find gpu id 0x%x\n", args->gpu_id); > + err = -EINVAL; > + goto out; > + } > > - pdd = kfd_bind_process_to_device(dev, p); > + pdd = kfd_bind_process_to_device(pdd->dev, p); > if (IS_ERR(pdd)) { > err = -ESRCH; > goto out; > @@ -525,7 +527,7 @@ static int kfd_ioctl_set_memory_policy(struct file *filep, > (args->alternate_policy == KFD_IOC_CACHE_POLICY_COHERENT) > ? 
cache_policy_coherent : cache_policy_noncoherent; > > - if (!dev->dqm->ops.set_cache_memory_policy(dev->dqm, > + if (!pdd->dev->dqm->ops.set_cache_memory_policy(pdd->dev->dqm, > &pdd->qpd, > default_policy, > alternate_policy, > @@ -543,17 +545,18 @@ static int kfd_ioctl_set_trap_handler(struct file *filep, > struct kfd_process *p, void *data) > { > struct kfd_ioctl_set_trap_handler_args *args = data; > - struct kfd_dev *dev; > int err = 0; > struct kfd_process_device *pdd; > > - dev = kfd_device_by_id(args->gpu_id); > - if (!dev) > - return -EINVAL; > - > mutex_lock(&p->mutex); > > - pdd = kfd_bind_process_to_device(dev, p); > + pdd = kfd_process_device_data_by_id(p, args->gpu_id); > + if (!pdd) { > + err = -EINVAL; > + goto out; > + } > + > + pdd = kfd_bind_process_to_device(pdd->dev, p); > if (IS_ERR(pdd)) { > err = -ESRCH; > goto out; > @@ -577,16 +580,20 @@ static int kfd_ioctl_dbg_register(struct file *filep, > bool create_ok; > long status = 0; > > - dev = kfd_device_by_id(args->gpu_id); > - if (!dev) > - return -EINVAL; > + mutex_lock(&p->mutex); > + pdd = kfd_process_device_data_by_id(p, args->gpu_id); > + if (!pdd) { > + status = -EINVAL; > + goto out_unlock_p; > + } > + dev = pdd->dev; > > if (dev->device_info->asic_family == CHIP_CARRIZO) { > pr_debug("kfd_ioctl_dbg_register not supported on CZ\n"); > - return -EINVAL; > + status = -EINVAL; > + goto out_unlock_p; > } > > - mutex_lock(&p->mutex); > mutex_lock(kfd_get_dbgmgr_mutex()); > > /* > @@ -596,7 +603,7 @@ static int kfd_ioctl_dbg_register(struct file *filep, > pdd = kfd_bind_process_to_device(dev, p); > if (IS_ERR(pdd)) { > status = PTR_ERR(pdd); > - goto out; > + goto out_unlock_dbg; > } > > if (!dev->dbgmgr) { > @@ -614,8 +621,9 @@ static int kfd_ioctl_dbg_register(struct file *filep, > status = -EINVAL; > } > > -out: > +out_unlock_dbg: > mutex_unlock(kfd_get_dbgmgr_mutex()); > +out_unlock_p: > mutex_unlock(&p->mutex); > > return status; > @@ -625,12 +633,18 @@ static int 
kfd_ioctl_dbg_unregister(struct file *filep, > struct kfd_process *p, void *data) > { > struct kfd_ioctl_dbg_unregister_args *args = data; > + struct kfd_process_device *pdd; > struct kfd_dev *dev; > long status; > > - dev = kfd_device_by_id(args->gpu_id); > - if (!dev || !dev->dbgmgr) > + mutex_lock(&p->mutex); > + pdd = kfd_process_device_data_by_id(p, args->gpu_id); > + if (!pdd || !pdd->dev->dbgmgr) { > + mutex_unlock(&p->mutex); > return -EINVAL; > + } > + dev = pdd->dev; > + mutex_unlock(&p->mutex); > > if (dev->device_info->asic_family == CHIP_CARRIZO) { > pr_debug("kfd_ioctl_dbg_unregister not supported on CZ\n"); > @@ -664,6 +678,7 @@ static int kfd_ioctl_dbg_address_watch(struct file *filep, > { > struct kfd_ioctl_dbg_address_watch_args *args = data; > struct kfd_dev *dev; > + struct kfd_process_device *pdd; > struct dbg_address_watch_info aw_info; > unsigned char *args_buff; > long status; > @@ -673,9 +688,15 @@ static int kfd_ioctl_dbg_address_watch(struct file *filep, > > memset((void *) &aw_info, 0, sizeof(struct dbg_address_watch_info)); > > - dev = kfd_device_by_id(args->gpu_id); > - if (!dev) > + mutex_lock(&p->mutex); > + pdd = kfd_process_device_data_by_id(p, args->gpu_id); > + if (!pdd) { > + mutex_unlock(&p->mutex); > + pr_debug("Could not find gpu id 0x%x\n", args->gpu_id); > return -EINVAL; > + } > + dev = pdd->dev; > + mutex_unlock(&p->mutex); > > if (dev->device_info->asic_family == CHIP_CARRIZO) { > pr_debug("kfd_ioctl_dbg_wave_control not supported on CZ\n"); > @@ -764,6 +785,7 @@ static int kfd_ioctl_dbg_wave_control(struct file *filep, > { > struct kfd_ioctl_dbg_wave_control_args *args = data; > struct kfd_dev *dev; > + struct kfd_process_device *pdd; > struct dbg_wave_control_info wac_info; > unsigned char *args_buff; > uint32_t computed_buff_size; > @@ -781,9 +803,15 @@ static int kfd_ioctl_dbg_wave_control(struct file *filep, > sizeof(wac_info.dbgWave_msg.MemoryVA) + > sizeof(wac_info.trapId); > > - dev = 
kfd_device_by_id(args->gpu_id); > - if (!dev) > + mutex_lock(&p->mutex); > + pdd = kfd_process_device_data_by_id(p, args->gpu_id); > + if (!pdd) { > + mutex_unlock(&p->mutex); > + pr_debug("Could not find gpu id 0x%x\n", args->gpu_id); > return -EINVAL; > + } > + dev = pdd->dev; > + mutex_unlock(&p->mutex); > > if (dev->device_info->asic_family == CHIP_CARRIZO) { > pr_debug("kfd_ioctl_dbg_wave_control not supported on CZ\n"); > @@ -847,16 +875,19 @@ static int kfd_ioctl_get_clock_counters(struct file *filep, > struct kfd_process *p, void *data) > { > struct kfd_ioctl_get_clock_counters_args *args = data; > - struct kfd_dev *dev; > + struct kfd_process_device *pdd; > > - dev = kfd_device_by_id(args->gpu_id); > - if (dev) > + mutex_lock(&p->mutex); > + pdd = kfd_process_device_data_by_id(p, args->gpu_id); > + if (pdd) > /* Reading GPU clock counter from KGD */ > - args->gpu_clock_counter = amdgpu_amdkfd_get_gpu_clock_counter(dev->kgd); > + args->gpu_clock_counter = amdgpu_amdkfd_get_gpu_clock_counter(pdd->dev->kgd); > else > /* Node without GPU resource */ > args->gpu_clock_counter = 0; > > + mutex_unlock(&p->mutex); > + > /* No access to rdtsc. 
Using raw monotonic time */ > args->cpu_clock_counter = ktime_get_raw_ns(); > args->system_clock_counter = ktime_get_boottime_ns(); > @@ -1070,11 +1101,13 @@ static int kfd_ioctl_set_scratch_backing_va(struct file *filep, > struct kfd_dev *dev; > long err; > > - dev = kfd_device_by_id(args->gpu_id); > - if (!dev) > - return -EINVAL; > - > mutex_lock(&p->mutex); > + pdd = kfd_process_device_data_by_id(p, args->gpu_id); > + if (!pdd) { > + err = -EINVAL; > + goto bind_process_to_device_fail; > + } > + dev = pdd->dev; > > pdd = kfd_bind_process_to_device(dev, p); > if (IS_ERR(pdd)) { > @@ -1102,15 +1135,20 @@ static int kfd_ioctl_get_tile_config(struct file *filep, > struct kfd_process *p, void *data) > { > struct kfd_ioctl_get_tile_config_args *args = data; > - struct kfd_dev *dev; > + struct kfd_process_device *pdd; > struct tile_config config; > int err = 0; > > - dev = kfd_device_by_id(args->gpu_id); > - if (!dev) > + mutex_lock(&p->mutex); > + pdd = kfd_process_device_data_by_id(p, args->gpu_id); > + if (!pdd) { > + mutex_unlock(&p->mutex); > return -EINVAL; > + } > > - amdgpu_amdkfd_get_tile_config(dev->kgd, &config); > + amdgpu_amdkfd_get_tile_config(pdd->dev->kgd, &config); > + > + mutex_unlock(&p->mutex); > > args->gb_addr_config = config.gb_addr_config; > args->num_banks = config.num_banks; > @@ -1145,21 +1183,15 @@ static int kfd_ioctl_acquire_vm(struct file *filep, struct kfd_process *p, > { > struct kfd_ioctl_acquire_vm_args *args = data; > struct kfd_process_device *pdd; > - struct kfd_dev *dev; > struct file *drm_file; > int ret; > > - dev = kfd_device_by_id(args->gpu_id); > - if (!dev) > - return -EINVAL; > - > drm_file = fget(args->drm_fd); > if (!drm_file) > return -EINVAL; > > mutex_lock(&p->mutex); > - > - pdd = kfd_get_process_device_data(dev, p); > + pdd = kfd_process_device_data_by_id(p, args->gpu_id); > if (!pdd) { > ret = -EINVAL; > goto err_unlock; > @@ -1218,19 +1250,23 @@ static int kfd_ioctl_alloc_memory_of_gpu(struct file *filep, > if 
(args->size == 0) > return -EINVAL; > > - dev = kfd_device_by_id(args->gpu_id); > - if (!dev) > - return -EINVAL; > + mutex_lock(&p->mutex); > + pdd = kfd_process_device_data_by_id(p, args->gpu_id); > + if (!pdd) { > + err = -EINVAL; > + goto err_unlock; > + } > + > + dev = pdd->dev; > > if ((flags & KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC) && > (flags & KFD_IOC_ALLOC_MEM_FLAGS_VRAM) && > !kfd_dev_is_large_bar(dev)) { > pr_err("Alloc host visible vram on small bar is not allowed\n"); > - return -EINVAL; > + err = -EINVAL; > + goto err_unlock; > } > > - mutex_lock(&p->mutex); > - > pdd = kfd_bind_process_to_device(dev, p); > if (IS_ERR(pdd)) { > err = PTR_ERR(pdd); > @@ -1301,17 +1337,12 @@ static int kfd_ioctl_free_memory_of_gpu(struct file *filep, > struct kfd_ioctl_free_memory_of_gpu_args *args = data; > struct kfd_process_device *pdd; > void *mem; > - struct kfd_dev *dev; > int ret; > uint64_t size = 0; > > - dev = kfd_device_by_id(GET_GPU_ID(args->handle)); > - if (!dev) > - return -EINVAL; > - > mutex_lock(&p->mutex); > > - pdd = kfd_get_process_device_data(dev, p); > + pdd = kfd_process_device_data_by_id(p, GET_GPU_ID(args->handle)); > if (!pdd) { > pr_err("Process device data doesn't exist\n"); > ret = -EINVAL; > @@ -1325,7 +1356,7 @@ static int kfd_ioctl_free_memory_of_gpu(struct file *filep, > goto err_unlock; > } > > - ret = amdgpu_amdkfd_gpuvm_free_memory_of_gpu(dev->kgd, > + ret = amdgpu_amdkfd_gpuvm_free_memory_of_gpu(pdd->dev->kgd, > (struct kgd_mem *)mem, pdd->drm_priv, &size); > > /* If freeing the buffer failed, leave the handle in place for > @@ -1348,15 +1379,11 @@ static int kfd_ioctl_map_memory_to_gpu(struct file *filep, > struct kfd_ioctl_map_memory_to_gpu_args *args = data; > struct kfd_process_device *pdd, *peer_pdd; > void *mem; > - struct kfd_dev *dev, *peer; > + struct kfd_dev *dev; > long err = 0; > int i; > uint32_t *devices_arr = NULL; > > - dev = kfd_device_by_id(GET_GPU_ID(args->handle)); > - if (!dev) > - return -EINVAL; > - > if 
(!args->n_devices) { > pr_debug("Device IDs array empty\n"); > return -EINVAL; > @@ -1380,6 +1407,12 @@ static int kfd_ioctl_map_memory_to_gpu(struct file *filep, > } > > mutex_lock(&p->mutex); > + pdd = kfd_process_device_data_by_id(p, GET_GPU_ID(args->handle)); > + if (!pdd) { > + err = -EINVAL; > + goto get_process_device_data_failed; > + } > + dev = pdd->dev; > > pdd = kfd_bind_process_to_device(dev, p); > if (IS_ERR(pdd)) { > @@ -1395,21 +1428,21 @@ static int kfd_ioctl_map_memory_to_gpu(struct file *filep, > } > > for (i = args->n_success; i < args->n_devices; i++) { > - peer = kfd_device_by_id(devices_arr[i]); > - if (!peer) { > + peer_pdd = kfd_process_device_data_by_id(p, devices_arr[i]); > + if (!peer_pdd) { > pr_debug("Getting device by id failed for 0x%x\n", > devices_arr[i]); > err = -EINVAL; > goto get_mem_obj_from_handle_failed; > } > > - peer_pdd = kfd_bind_process_to_device(peer, p); > + peer_pdd = kfd_bind_process_to_device(peer_pdd->dev, p); > if (IS_ERR(peer_pdd)) { > err = PTR_ERR(peer_pdd); > goto get_mem_obj_from_handle_failed; > } > err = amdgpu_amdkfd_gpuvm_map_memory_to_gpu( > - peer->kgd, (struct kgd_mem *)mem, peer_pdd->drm_priv); > + peer_pdd->dev->kgd, (struct kgd_mem *)mem, peer_pdd->drm_priv); > if (err) { > pr_err("Failed to map to gpu %d/%d\n", > i, args->n_devices); > @@ -1428,12 +1461,10 @@ static int kfd_ioctl_map_memory_to_gpu(struct file *filep, > > /* Flush TLBs after waiting for the page table updates to complete */ > for (i = 0; i < args->n_devices; i++) { > - peer = kfd_device_by_id(devices_arr[i]); > - if (WARN_ON_ONCE(!peer)) > - continue; > - peer_pdd = kfd_get_process_device_data(peer, p); > + peer_pdd = kfd_process_device_data_by_id(p, devices_arr[i]); > if (WARN_ON_ONCE(!peer_pdd)) > continue; > + > kfd_flush_tlb(peer_pdd); > } > > @@ -1441,6 +1472,7 @@ static int kfd_ioctl_map_memory_to_gpu(struct file *filep, > > return err; > > +get_process_device_data_failed: > bind_process_to_device_failed: > 
get_mem_obj_from_handle_failed: > map_memory_to_gpu_failed: > @@ -1458,14 +1490,9 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file *filep, > struct kfd_ioctl_unmap_memory_from_gpu_args *args = data; > struct kfd_process_device *pdd, *peer_pdd; > void *mem; > - struct kfd_dev *dev, *peer; > long err = 0; > uint32_t *devices_arr = NULL, i; > > - dev = kfd_device_by_id(GET_GPU_ID(args->handle)); > - if (!dev) > - return -EINVAL; > - > if (!args->n_devices) { > pr_debug("Device IDs array empty\n"); > return -EINVAL; > @@ -1489,8 +1516,7 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file *filep, > } > > mutex_lock(&p->mutex); > - > - pdd = kfd_get_process_device_data(dev, p); > + pdd = kfd_process_device_data_by_id(p, GET_GPU_ID(args->handle)); > if (!pdd) { > err = -EINVAL; > goto bind_process_to_device_failed; > @@ -1504,19 +1530,13 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file *filep, > } > > for (i = args->n_success; i < args->n_devices; i++) { > - peer = kfd_device_by_id(devices_arr[i]); > - if (!peer) { > - err = -EINVAL; > - goto get_mem_obj_from_handle_failed; > - } > - > - peer_pdd = kfd_get_process_device_data(peer, p); > + peer_pdd = kfd_process_device_data_by_id(p, devices_arr[i]); > if (!peer_pdd) { > - err = -ENODEV; > + err = -EINVAL; > goto get_mem_obj_from_handle_failed; > } > err = amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu( > - peer->kgd, (struct kgd_mem *)mem, peer_pdd->drm_priv); > + peer_pdd->dev->kgd, (struct kgd_mem *)mem, peer_pdd->drm_priv); > if (err) { > pr_err("Failed to unmap from gpu %d/%d\n", > i, args->n_devices); > @@ -1645,23 +1665,26 @@ static int kfd_ioctl_import_dmabuf(struct file *filep, > void *mem; > int r; > > - dev = kfd_device_by_id(args->gpu_id); > - if (!dev) > - return -EINVAL; > + mutex_lock(&p->mutex); > + pdd = kfd_process_device_data_by_id(p, args->gpu_id); > + if (!pdd) { > + r = -EINVAL; > + goto err_unlock; > + } > > dmabuf = dma_buf_get(args->dmabuf_fd); > - if (IS_ERR(dmabuf)) > - return 
PTR_ERR(dmabuf); > - > - mutex_lock(&p->mutex); > + if (IS_ERR(dmabuf)) { > + r = PTR_ERR(dmabuf); > + goto err_unlock; > + } > > - pdd = kfd_bind_process_to_device(dev, p); > + pdd = kfd_bind_process_to_device(pdd->dev, p); > if (IS_ERR(pdd)) { > r = PTR_ERR(pdd); > goto err_unlock; > } > > - r = amdgpu_amdkfd_gpuvm_import_dmabuf(dev->kgd, dmabuf, > + r = amdgpu_amdkfd_gpuvm_import_dmabuf(pdd->dev->kgd, dmabuf, > args->va_addr, pdd->drm_priv, > (struct kgd_mem **)&mem, &size, > NULL); > @@ -1695,13 +1718,19 @@ static int kfd_ioctl_smi_events(struct file *filep, > struct kfd_process *p, void *data) > { > struct kfd_ioctl_smi_events_args *args = data; > - struct kfd_dev *dev; > + struct kfd_process_device *pdd; > > - dev = kfd_device_by_id(args->gpuid); > - if (!dev) > + mutex_lock(&p->mutex); > + > + pdd = kfd_process_device_data_by_id(p, args->gpuid); > + if (!pdd) { > + mutex_unlock(&p->mutex); > return -EINVAL; > + } > > - return kfd_smi_event_open(dev, &args->anon_fd); > + mutex_unlock(&p->mutex); > + > + return kfd_smi_event_open(pdd->dev, &args->anon_fd); > } > > static int kfd_ioctl_set_xnack_mode(struct file *filep, > @@ -1800,6 +1829,57 @@ static int criu_dump_process(struct kfd_process *p, struct kfd_ioctl_criu_dumper > return ret; > } > > +static int criu_dump_devices(struct kfd_process *p, struct kfd_ioctl_criu_dumper_args *args) > +{ > + struct kfd_criu_device_bucket *device_buckets; > + int ret = 0, i; > + > + if (args->num_objects != p->n_pdds) { > + pr_err("Mismatch with number of devices (current:%d user:%lld)\n", > + p->n_pdds, args->num_objects); > + return -EINVAL; > + } > + > + if (args->objects_size != args->num_objects * > + (sizeof(*device_buckets) + sizeof(struct kfd_criu_device_priv_data))) { > + pr_err("Invalid objects size for devices\n"); > + return -EINVAL; > + } > + > + device_buckets = kvzalloc(args->objects_size, GFP_KERNEL); > + if (!device_buckets) > + return -ENOMEM; > + > + /* Private data for devices is not currently used. 
To set private data > + * struct kfd_criu_device_priv_data * device_privs = (struct kfd_criu_device_priv_data*) > + * ((uint8_t*)device_buckets + > + * (args->num_objects * (sizeof(*device_buckets)))); > + */ > + > + for (i = 0; i < args->num_objects; i++) { > + struct kfd_process_device *pdd = p->pdds[i]; > + > + device_buckets[i].user_gpu_id = pdd->user_gpu_id; > + device_buckets[i].actual_gpu_id = pdd->dev->id; > + > + /* priv_data does not contain useful information for now and is reserved for > + * future use, so we do not set its contents > + */ > + device_buckets[i].priv_data_offset = i * sizeof(struct kfd_criu_device_priv_data); > + device_buckets[i].priv_data_size = sizeof(struct kfd_criu_device_priv_data); > + } > + > + ret = copy_to_user((void __user *)args->objects, device_buckets, args->objects_size); > + > + if (ret) { > + pr_err("Failed to copy device information to user\n"); > + ret = -EFAULT; > + } > + > + kvfree(device_buckets); > + return ret; > +} > + > uint64_t get_process_num_bos(struct kfd_process *p) > { > uint64_t num_of_bos = 0, i; > @@ -2231,6 +2311,9 @@ static int kfd_ioctl_criu_dumper(struct file *filep, > case KFD_CRIU_OBJECT_TYPE_PROCESS: > ret = criu_dump_process(p, args); > break; > + case KFD_CRIU_OBJECT_TYPE_DEVICE: > + ret = criu_dump_devices(p, args); > + break; > case KFD_CRIU_OBJECT_TYPE_BO: > ret = criu_dump_bos(p, args); > break; > @@ -2240,7 +2323,6 @@ static int kfd_ioctl_criu_dumper(struct file *filep, > case KFD_CRIU_OBJECT_TYPE_EVENT: > ret = criu_dump_events(p, args); > break; > - case KFD_CRIU_OBJECT_TYPE_DEVICE: > case KFD_CRIU_OBJECT_TYPE_SVM_RANGE: > default: > pr_err("Unsupported object type:%d\n", args->type); > @@ -2301,6 +2383,102 @@ static int criu_restore_process(struct kfd_process *p, struct kfd_ioctl_criu_res > return ret; > } > > +static int criu_restore_devices(struct kfd_process *p, struct kfd_ioctl_criu_restorer_args *args) > +{ > + int ret = 0, i; > + uint8_t *objects; > + struct kfd_criu_device_bucket 
*device_buckets; > + > + if (args->num_objects != p->n_pdds) > + return -EINVAL; > + > + if (args->objects_size != args->num_objects * > + (sizeof(*device_buckets) + sizeof(struct kfd_criu_device_priv_data))) { > + pr_err("Invalid objects size for devices\n"); > + return -EINVAL; > + } > + > + objects = kmalloc(args->objects_size, GFP_KERNEL); > + if (!objects) > + return -ENOMEM; > + > + ret = copy_from_user(objects, (void __user *)args->objects, args->objects_size); > + if (ret) { > + pr_err("Failed to copy devices information from user\n"); > + ret = -EFAULT; > + goto exit; > + } > + > + device_buckets = (struct kfd_criu_device_bucket *) objects; > + > + for (i = 0; i < args->num_objects; i++) { > + struct kfd_dev *dev; > + struct kfd_process_device *pdd; > + struct file *drm_file; > + > + /* device private data is not currently used. To access device private data: > + * uint8_t *private_datas = objects + > + * (args->num_objects * sizeof(*device_buckets)); > + * > + * struct kfd_criu_device_priv_data *device_priv = > + * (struct kfd_criu_device_priv_data*) > + * (private_datas + device_buckets[i].priv_data_offset); > + */ > + > + dev = kfd_device_by_id(device_buckets[i].actual_gpu_id); > + if (!dev) { > + pr_err("Failed to find device with gpu_id = %x\n", > + device_buckets[i].actual_gpu_id); > + ret = -EINVAL; > + goto exit; > + } > + > + pdd = kfd_get_process_device_data(dev, p); > + if (!pdd) { > + pr_err("Failed to get pdd for gpu_id = %x\n", > + device_buckets[i].actual_gpu_id); > + ret = -EINVAL; > + goto exit; > + } > + pdd->user_gpu_id = device_buckets[i].user_gpu_id; > + > + drm_file = fget(device_buckets[i].drm_fd); > + if (!drm_file) { > + pr_err("Invalid render node file descriptor sent from plugin (%d)\n", > + device_buckets[i].drm_fd); > + ret = -EINVAL; > + goto exit; > + } > + > + if (pdd->drm_file) { > + ret = -EINVAL; > + goto exit; > + } > + > + /* create the vm using render nodes for kfd pdd */ > + if (kfd_process_device_init_vm(pdd, 
drm_file)) { > + pr_err("could not init vm for given pdd\n"); > + /* On success, the PDD keeps the drm_file reference */ > + fput(drm_file); > + ret = -EINVAL; > + goto exit; > + } > + /* > + * pdd now already has the vm bound to render node so below api won't create a new > + * exclusive kfd mapping but use existing one with renderDXXX but is still needed > + * for iommu v2 binding and runtime pm. > + */ > + pdd = kfd_bind_process_to_device(dev, p); > + if (IS_ERR(pdd)) { > + ret = PTR_ERR(pdd); > + goto exit; > + } > + } > +exit: > + kvfree(objects); > + return ret; > +} > + > static int criu_restore_bos(struct kfd_process *p, struct kfd_ioctl_criu_restorer_args *args) > { > uint8_t *objects, *private_data; > @@ -2719,6 +2897,9 @@ static int kfd_ioctl_criu_restorer(struct file *filep, > case KFD_CRIU_OBJECT_TYPE_PROCESS: > ret = criu_restore_process(p, args); > break; > + case KFD_CRIU_OBJECT_TYPE_DEVICE: > + ret = criu_restore_devices(p, args); > + break; > case KFD_CRIU_OBJECT_TYPE_BO: > ret = criu_restore_bos(p, args); > break; > @@ -2728,7 +2909,6 @@ static int kfd_ioctl_criu_restorer(struct file *filep, > case KFD_CRIU_OBJECT_TYPE_EVENT: > ret = criu_restore_events(filep, p, args); > break; > - case KFD_CRIU_OBJECT_TYPE_DEVICE: > case KFD_CRIU_OBJECT_TYPE_SVM_RANGE: > default: > pr_err("Unsupported object type:%d\n", args->type); > @@ -2819,6 +2999,11 @@ static int kfd_ioctl_criu_process_info(struct file *filep, > > args->process_priv_data_size = sizeof(struct kfd_criu_process_priv_data); > > + args->total_devices = p->n_pdds; > + /* devices_priv_data_size does not contain any useful information for now */ > + args->devices_priv_data_size = args->total_devices * > + sizeof(struct kfd_criu_device_priv_data); > + > args->total_bos = get_process_num_bos(p); > args->bos_priv_data_size = args->total_bos * sizeof(struct kfd_criu_bo_priv_data); > > @@ -2832,7 +3017,8 @@ static int kfd_ioctl_criu_process_info(struct file *filep, > args->total_events = 
kfd_get_num_events(p); > args->events_priv_data_size = args->total_events * sizeof(struct kfd_criu_event_priv_data); > > - dev_dbg(kfd_device, "Num of bos:%llu queues:%u events:%u\n", > + dev_dbg(kfd_device, "Num of devices:%u bos:%llu queues:%u events:%u\n", > + args->total_devices, > args->total_bos, > args->total_queues, > args->total_events); > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c > index 18362478e351..5e9067b70908 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c > @@ -343,11 +343,12 @@ int kfd_kmap_event_page(struct kfd_process *p, uint64_t event_page_offset) > return -EINVAL; > } > > - kfd = kfd_device_by_id(GET_GPU_ID(event_page_offset)); > - if (!kfd) { > + pdd = kfd_process_device_data_by_id(p, GET_GPU_ID(event_page_offset)); > + if (!pdd) { > pr_err("Getting device by id failed in %s\n", __func__); > return -EINVAL; > } > + kfd = pdd->dev; > > pdd = kfd_bind_process_to_device(kfd, p); > if (IS_ERR(pdd)) > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > index bf10a5305ef7..1912df8d9101 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > @@ -759,6 +759,13 @@ struct kfd_process_device { > * number of CU's a device has along with number of other competing processes > */ > struct attribute attr_cu_occupancy; > + > + /* > + * If this process has been checkpointed before, then the user > + * application will use the original gpu_id on the > + * checkpointed node to refer to this device. 
> + */ > + uint32_t user_gpu_id; > }; > > #define qpd_to_pdd(x) container_of(x, struct kfd_process_device, qpd) > @@ -914,6 +921,9 @@ int kfd_process_restore_queues(struct kfd_process *p); > void kfd_suspend_all_processes(void); > int kfd_resume_all_processes(void); > > +struct kfd_process_device *kfd_process_device_data_by_id(struct kfd_process *process, > + uint32_t gpu_id); > + > int kfd_process_device_init_vm(struct kfd_process_device *pdd, > struct file *drm_file); > struct kfd_process_device *kfd_bind_process_to_device(struct kfd_dev *dev, > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c > index e4cb2f778590..a23f2162eb8b 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c > @@ -1425,6 +1425,7 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev, > pdd->runtime_inuse = false; > pdd->vram_usage = 0; > pdd->sdma_past_activity_counter = 0; > + pdd->user_gpu_id = dev->id; > atomic64_set(&pdd->evict_duration_counter, 0); > p->pdds[p->n_pdds++] = pdd; > > @@ -1898,6 +1899,23 @@ void kfd_flush_tlb(struct kfd_process_device *pdd) > } > } > > +struct kfd_process_device *kfd_process_device_data_by_id(struct kfd_process *p, uint32_t gpu_id) > +{ > + int i; > + > + if (gpu_id) { > + for (i = 0; i < p->n_pdds; i++) { > + struct kfd_process_device *pdd = p->pdds[i]; > + > + if (pdd->user_gpu_id == gpu_id) > + return pdd; > + } > + > + WARN_ONCE(1, "Failed to find mapping for gpu = 0x%x\n", gpu_id); > + } > + return NULL; > +} > + > #if defined(CONFIG_DEBUG_FS) > > int kfd_debugfs_mqds_by_process(struct seq_file *m, void *data)