The kfd CRIU code takes up about a thousand lines in the kfd_chardev file; move it to its own file. No functional change intended. Signed-off-by: David Francis <David.Francis@xxxxxxx> --- drivers/gpu/drm/amd/amdkfd/Makefile | 1 + drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 972 +--------------------- drivers/gpu/drm/amd/amdkfd/kfd_criu.c | 989 +++++++++++++++++++++++ drivers/gpu/drm/amd/amdkfd/kfd_criu.h | 50 ++ 4 files changed, 1046 insertions(+), 966 deletions(-) create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_criu.c create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_criu.h diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile b/drivers/gpu/drm/amd/amdkfd/Makefile index 0d3d8972240d..e06af4073ac5 100644 --- a/drivers/gpu/drm/amd/amdkfd/Makefile +++ b/drivers/gpu/drm/amd/amdkfd/Makefile @@ -32,6 +32,7 @@ AMDKFD_FILES := $(AMDKFD_PATH)/kfd_module.o \ $(AMDKFD_PATH)/kfd_flat_memory.o \ $(AMDKFD_PATH)/kfd_process.o \ $(AMDKFD_PATH)/kfd_queue.o \ + $(AMDKFD_PATH)/kfd_criu.o \ $(AMDKFD_PATH)/kfd_mqd_manager.o \ $(AMDKFD_PATH)/kfd_mqd_manager_cik.o \ $(AMDKFD_PATH)/kfd_mqd_manager_vi.o \ diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index 6b713fb0b818..e6e44a199a93 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -45,6 +45,7 @@ #include "kfd_smi_events.h" #include "amdgpu_dma_buf.h" #include "kfd_debug.h" +#include "kfd_criu.h" static long kfd_ioctl(struct file *, unsigned int, unsigned long); static int kfd_open(struct inode *, struct file *); @@ -1751,967 +1752,6 @@ static int kfd_ioctl_svm(struct file *filep, struct kfd_process *p, void *data) } #endif -static int criu_checkpoint_process(struct kfd_process *p, - uint8_t __user *user_priv_data, - uint64_t *priv_offset) -{ - struct kfd_criu_process_priv_data process_priv; - int ret; - - memset(&process_priv, 0, sizeof(process_priv)); - - process_priv.version = KFD_CRIU_PRIV_VERSION; - /* For CR, we don't consider 
negative xnack mode which is used for - * querying without changing it, here 0 simply means disabled and 1 - * means enabled so retry for finding a valid PTE. - */ - process_priv.xnack_mode = p->xnack_enabled ? 1 : 0; - - ret = copy_to_user(user_priv_data + *priv_offset, - &process_priv, sizeof(process_priv)); - - if (ret) { - pr_err("Failed to copy process information to user\n"); - ret = -EFAULT; - } - - *priv_offset += sizeof(process_priv); - return ret; -} - -static int criu_checkpoint_devices(struct kfd_process *p, - uint32_t num_devices, - uint8_t __user *user_addr, - uint8_t __user *user_priv_data, - uint64_t *priv_offset) -{ - struct kfd_criu_device_priv_data *device_priv = NULL; - struct kfd_criu_device_bucket *device_buckets = NULL; - int ret = 0, i; - - device_buckets = kvzalloc(num_devices * sizeof(*device_buckets), GFP_KERNEL); - if (!device_buckets) { - ret = -ENOMEM; - goto exit; - } - - device_priv = kvzalloc(num_devices * sizeof(*device_priv), GFP_KERNEL); - if (!device_priv) { - ret = -ENOMEM; - goto exit; - } - - for (i = 0; i < num_devices; i++) { - struct kfd_process_device *pdd = p->pdds[i]; - - device_buckets[i].user_gpu_id = pdd->user_gpu_id; - device_buckets[i].actual_gpu_id = pdd->dev->id; - - /* - * priv_data does not contain useful information for now and is reserved for - * future use, so we do not set its contents. 
- */ - } - - ret = copy_to_user(user_addr, device_buckets, num_devices * sizeof(*device_buckets)); - if (ret) { - pr_err("Failed to copy device information to user\n"); - ret = -EFAULT; - goto exit; - } - - ret = copy_to_user(user_priv_data + *priv_offset, - device_priv, - num_devices * sizeof(*device_priv)); - if (ret) { - pr_err("Failed to copy device information to user\n"); - ret = -EFAULT; - } - *priv_offset += num_devices * sizeof(*device_priv); - -exit: - kvfree(device_buckets); - kvfree(device_priv); - return ret; -} - -static uint32_t get_process_num_bos(struct kfd_process *p) -{ - uint32_t num_of_bos = 0; - int i; - - /* Run over all PDDs of the process */ - for (i = 0; i < p->n_pdds; i++) { - struct kfd_process_device *pdd = p->pdds[i]; - void *mem; - int id; - - idr_for_each_entry(&pdd->alloc_idr, mem, id) { - struct kgd_mem *kgd_mem = (struct kgd_mem *)mem; - - if (!kgd_mem->va || kgd_mem->va > pdd->gpuvm_base) - num_of_bos++; - } - } - return num_of_bos; -} - -static int criu_get_prime_handle(struct kgd_mem *mem, - int flags, u32 *shared_fd) -{ - struct dma_buf *dmabuf; - int ret; - - ret = amdgpu_amdkfd_gpuvm_export_dmabuf(mem, &dmabuf); - if (ret) { - pr_err("dmabuf export failed for the BO\n"); - return ret; - } - - ret = dma_buf_fd(dmabuf, flags); - if (ret < 0) { - pr_err("dmabuf create fd failed, ret:%d\n", ret); - goto out_free_dmabuf; - } - - *shared_fd = ret; - return 0; - -out_free_dmabuf: - dma_buf_put(dmabuf); - return ret; -} - -static int criu_checkpoint_bos(struct kfd_process *p, - uint32_t num_bos, - uint8_t __user *user_bos, - uint8_t __user *user_priv_data, - uint64_t *priv_offset) -{ - struct kfd_criu_bo_bucket *bo_buckets; - struct kfd_criu_bo_priv_data *bo_privs; - int ret = 0, pdd_index, bo_index = 0, id; - void *mem; - - bo_buckets = kvzalloc(num_bos * sizeof(*bo_buckets), GFP_KERNEL); - if (!bo_buckets) - return -ENOMEM; - - bo_privs = kvzalloc(num_bos * sizeof(*bo_privs), GFP_KERNEL); - if (!bo_privs) { - ret = -ENOMEM; - goto 
exit; - } - - for (pdd_index = 0; pdd_index < p->n_pdds; pdd_index++) { - struct kfd_process_device *pdd = p->pdds[pdd_index]; - struct amdgpu_bo *dumper_bo; - struct kgd_mem *kgd_mem; - - idr_for_each_entry(&pdd->alloc_idr, mem, id) { - struct kfd_criu_bo_bucket *bo_bucket; - struct kfd_criu_bo_priv_data *bo_priv; - int i, dev_idx = 0; - - if (!mem) { - ret = -ENOMEM; - goto exit; - } - - kgd_mem = (struct kgd_mem *)mem; - dumper_bo = kgd_mem->bo; - - /* Skip checkpointing BOs that are used for Trap handler - * code and state. Currently, these BOs have a VA that - * is less GPUVM Base - */ - if (kgd_mem->va && kgd_mem->va <= pdd->gpuvm_base) - continue; - - bo_bucket = &bo_buckets[bo_index]; - bo_priv = &bo_privs[bo_index]; - - bo_bucket->gpu_id = pdd->user_gpu_id; - bo_bucket->addr = (uint64_t)kgd_mem->va; - bo_bucket->size = amdgpu_bo_size(dumper_bo); - bo_bucket->alloc_flags = (uint32_t)kgd_mem->alloc_flags; - bo_priv->idr_handle = id; - - if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_USERPTR) { - ret = amdgpu_ttm_tt_get_userptr(&dumper_bo->tbo, - &bo_priv->user_addr); - if (ret) { - pr_err("Failed to obtain user address for user-pointer bo\n"); - goto exit; - } - } - if (bo_bucket->alloc_flags - & (KFD_IOC_ALLOC_MEM_FLAGS_VRAM | KFD_IOC_ALLOC_MEM_FLAGS_GTT)) { - ret = criu_get_prime_handle(kgd_mem, - bo_bucket->alloc_flags & - KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE ? 
DRM_RDWR : 0, - &bo_bucket->dmabuf_fd); - if (ret) - goto exit; - } else { - bo_bucket->dmabuf_fd = KFD_INVALID_FD; - } - - if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL) - bo_bucket->offset = KFD_MMAP_TYPE_DOORBELL | - KFD_MMAP_GPU_ID(pdd->dev->id); - else if (bo_bucket->alloc_flags & - KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP) - bo_bucket->offset = KFD_MMAP_TYPE_MMIO | - KFD_MMAP_GPU_ID(pdd->dev->id); - else - bo_bucket->offset = amdgpu_bo_mmap_offset(dumper_bo); - - for (i = 0; i < p->n_pdds; i++) { - if (amdgpu_amdkfd_bo_mapped_to_dev(p->pdds[i]->dev->adev, kgd_mem)) - bo_priv->mapped_gpuids[dev_idx++] = p->pdds[i]->user_gpu_id; - } - - pr_debug("bo_size = 0x%llx, bo_addr = 0x%llx bo_offset = 0x%llx\n" - "gpu_id = 0x%x alloc_flags = 0x%x idr_handle = 0x%x", - bo_bucket->size, - bo_bucket->addr, - bo_bucket->offset, - bo_bucket->gpu_id, - bo_bucket->alloc_flags, - bo_priv->idr_handle); - bo_index++; - } - } - - ret = copy_to_user(user_bos, bo_buckets, num_bos * sizeof(*bo_buckets)); - if (ret) { - pr_err("Failed to copy BO information to user\n"); - ret = -EFAULT; - goto exit; - } - - ret = copy_to_user(user_priv_data + *priv_offset, bo_privs, num_bos * sizeof(*bo_privs)); - if (ret) { - pr_err("Failed to copy BO priv information to user\n"); - ret = -EFAULT; - goto exit; - } - - *priv_offset += num_bos * sizeof(*bo_privs); - -exit: - while (ret && bo_index--) { - if (bo_buckets[bo_index].alloc_flags - & (KFD_IOC_ALLOC_MEM_FLAGS_VRAM | KFD_IOC_ALLOC_MEM_FLAGS_GTT)) - close_fd(bo_buckets[bo_index].dmabuf_fd); - } - - kvfree(bo_buckets); - kvfree(bo_privs); - return ret; -} - -static int criu_get_process_object_info(struct kfd_process *p, - uint32_t *num_devices, - uint32_t *num_bos, - uint32_t *num_objects, - uint64_t *objs_priv_size) -{ - uint64_t queues_priv_data_size, svm_priv_data_size, priv_size; - uint32_t num_queues, num_events, num_svm_ranges; - int ret; - - *num_devices = p->n_pdds; - *num_bos = get_process_num_bos(p); - - ret = 
kfd_process_get_queue_info(p, &num_queues, &queues_priv_data_size); - if (ret) - return ret; - - num_events = kfd_get_num_events(p); - - ret = svm_range_get_info(p, &num_svm_ranges, &svm_priv_data_size); - if (ret) - return ret; - - *num_objects = num_queues + num_events + num_svm_ranges; - - if (objs_priv_size) { - priv_size = sizeof(struct kfd_criu_process_priv_data); - priv_size += *num_devices * sizeof(struct kfd_criu_device_priv_data); - priv_size += *num_bos * sizeof(struct kfd_criu_bo_priv_data); - priv_size += queues_priv_data_size; - priv_size += num_events * sizeof(struct kfd_criu_event_priv_data); - priv_size += svm_priv_data_size; - *objs_priv_size = priv_size; - } - return 0; -} - -static int criu_checkpoint(struct file *filep, - struct kfd_process *p, - struct kfd_ioctl_criu_args *args) -{ - int ret; - uint32_t num_devices, num_bos, num_objects; - uint64_t priv_size, priv_offset = 0, bo_priv_offset; - - if (!args->devices || !args->bos || !args->priv_data) - return -EINVAL; - - mutex_lock(&p->mutex); - - if (!p->n_pdds) { - pr_err("No pdd for given process\n"); - ret = -ENODEV; - goto exit_unlock; - } - - /* Confirm all process queues are evicted */ - if (!p->queues_paused) { - pr_err("Cannot dump process when queues are not in evicted state\n"); - /* CRIU plugin did not call op PROCESS_INFO before checkpointing */ - ret = -EINVAL; - goto exit_unlock; - } - - ret = criu_get_process_object_info(p, &num_devices, &num_bos, &num_objects, &priv_size); - if (ret) - goto exit_unlock; - - if (num_devices != args->num_devices || - num_bos != args->num_bos || - num_objects != args->num_objects || - priv_size != args->priv_data_size) { - - ret = -EINVAL; - goto exit_unlock; - } - - /* each function will store private data inside priv_data and adjust priv_offset */ - ret = criu_checkpoint_process(p, (uint8_t __user *)args->priv_data, &priv_offset); - if (ret) - goto exit_unlock; - - ret = criu_checkpoint_devices(p, num_devices, (uint8_t __user *)args->devices, - 
(uint8_t __user *)args->priv_data, &priv_offset); - if (ret) - goto exit_unlock; - - /* Leave room for BOs in the private data. They need to be restored - * before events, but we checkpoint them last to simplify the error - * handling. - */ - bo_priv_offset = priv_offset; - priv_offset += num_bos * sizeof(struct kfd_criu_bo_priv_data); - - if (num_objects) { - ret = kfd_criu_checkpoint_queues(p, (uint8_t __user *)args->priv_data, - &priv_offset); - if (ret) - goto exit_unlock; - - ret = kfd_criu_checkpoint_events(p, (uint8_t __user *)args->priv_data, - &priv_offset); - if (ret) - goto exit_unlock; - - ret = kfd_criu_checkpoint_svm(p, (uint8_t __user *)args->priv_data, &priv_offset); - if (ret) - goto exit_unlock; - } - - /* This must be the last thing in this function that can fail. - * Otherwise we leak dmabuf file descriptors. - */ - ret = criu_checkpoint_bos(p, num_bos, (uint8_t __user *)args->bos, - (uint8_t __user *)args->priv_data, &bo_priv_offset); - -exit_unlock: - mutex_unlock(&p->mutex); - if (ret) - pr_err("Failed to dump CRIU ret:%d\n", ret); - else - pr_debug("CRIU dump ret:%d\n", ret); - - return ret; -} - -static int criu_restore_process(struct kfd_process *p, - struct kfd_ioctl_criu_args *args, - uint64_t *priv_offset, - uint64_t max_priv_data_size) -{ - int ret = 0; - struct kfd_criu_process_priv_data process_priv; - - if (*priv_offset + sizeof(process_priv) > max_priv_data_size) - return -EINVAL; - - ret = copy_from_user(&process_priv, - (void __user *)(args->priv_data + *priv_offset), - sizeof(process_priv)); - if (ret) { - pr_err("Failed to copy process private information from user\n"); - ret = -EFAULT; - goto exit; - } - *priv_offset += sizeof(process_priv); - - if (process_priv.version != KFD_CRIU_PRIV_VERSION) { - pr_err("Invalid CRIU API version (checkpointed:%d current:%d)\n", - process_priv.version, KFD_CRIU_PRIV_VERSION); - return -EINVAL; - } - - pr_debug("Setting XNACK mode\n"); - if (process_priv.xnack_mode && 
!kfd_process_xnack_mode(p, true)) { - pr_err("xnack mode cannot be set\n"); - ret = -EPERM; - goto exit; - } else { - pr_debug("set xnack mode: %d\n", process_priv.xnack_mode); - p->xnack_enabled = process_priv.xnack_mode; - } - -exit: - return ret; -} - -static int criu_restore_devices(struct kfd_process *p, - struct kfd_ioctl_criu_args *args, - uint64_t *priv_offset, - uint64_t max_priv_data_size) -{ - struct kfd_criu_device_bucket *device_buckets; - struct kfd_criu_device_priv_data *device_privs; - int ret = 0; - uint32_t i; - - if (args->num_devices != p->n_pdds) - return -EINVAL; - - if (*priv_offset + (args->num_devices * sizeof(*device_privs)) > max_priv_data_size) - return -EINVAL; - - device_buckets = kmalloc_array(args->num_devices, sizeof(*device_buckets), GFP_KERNEL); - if (!device_buckets) - return -ENOMEM; - - ret = copy_from_user(device_buckets, (void __user *)args->devices, - args->num_devices * sizeof(*device_buckets)); - if (ret) { - pr_err("Failed to copy devices buckets from user\n"); - ret = -EFAULT; - goto exit; - } - - for (i = 0; i < args->num_devices; i++) { - struct kfd_node *dev; - struct kfd_process_device *pdd; - struct file *drm_file; - - /* device private data is not currently used */ - - if (!device_buckets[i].user_gpu_id) { - pr_err("Invalid user gpu_id\n"); - ret = -EINVAL; - goto exit; - } - - dev = kfd_device_by_id(device_buckets[i].actual_gpu_id); - if (!dev) { - pr_err("Failed to find device with gpu_id = %x\n", - device_buckets[i].actual_gpu_id); - ret = -EINVAL; - goto exit; - } - - pdd = kfd_get_process_device_data(dev, p); - if (!pdd) { - pr_err("Failed to get pdd for gpu_id = %x\n", - device_buckets[i].actual_gpu_id); - ret = -EINVAL; - goto exit; - } - pdd->user_gpu_id = device_buckets[i].user_gpu_id; - - drm_file = fget(device_buckets[i].drm_fd); - if (!drm_file) { - pr_err("Invalid render node file descriptor sent from plugin (%d)\n", - device_buckets[i].drm_fd); - ret = -EINVAL; - goto exit; - } - - if (pdd->drm_file) 
{ - ret = -EINVAL; - goto exit; - } - - /* create the vm using render nodes for kfd pdd */ - if (kfd_process_device_init_vm(pdd, drm_file)) { - pr_err("could not init vm for given pdd\n"); - /* On success, the PDD keeps the drm_file reference */ - fput(drm_file); - ret = -EINVAL; - goto exit; - } - /* - * pdd now already has the vm bound to render node so below api won't create a new - * exclusive kfd mapping but use existing one with renderDXXX but is still needed - * for iommu v2 binding and runtime pm. - */ - pdd = kfd_bind_process_to_device(dev, p); - if (IS_ERR(pdd)) { - ret = PTR_ERR(pdd); - goto exit; - } - - if (!pdd->qpd.proc_doorbells) { - ret = kfd_alloc_process_doorbells(dev->kfd, pdd); - if (ret) - goto exit; - } - } - - /* - * We are not copying device private data from user as we are not using the data for now, - * but we still adjust for its private data. - */ - *priv_offset += args->num_devices * sizeof(*device_privs); - -exit: - kfree(device_buckets); - return ret; -} - -static int criu_restore_memory_of_gpu(struct kfd_process_device *pdd, - struct kfd_criu_bo_bucket *bo_bucket, - struct kfd_criu_bo_priv_data *bo_priv, - struct kgd_mem **kgd_mem) -{ - int idr_handle; - int ret; - const bool criu_resume = true; - u64 offset; - - if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL) { - if (bo_bucket->size != - kfd_doorbell_process_slice(pdd->dev->kfd)) - return -EINVAL; - - offset = kfd_get_process_doorbells(pdd); - if (!offset) - return -ENOMEM; - } else if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP) { - /* MMIO BOs need remapped bus address */ - if (bo_bucket->size != PAGE_SIZE) { - pr_err("Invalid page size\n"); - return -EINVAL; - } - offset = pdd->dev->adev->rmmio_remap.bus_addr; - if (!offset) { - pr_err("amdgpu_amdkfd_get_mmio_remap_phys_addr failed\n"); - return -ENOMEM; - } - } else if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_USERPTR) { - offset = bo_priv->user_addr; - } - /* Create the BO */ - ret = 
amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(pdd->dev->adev, bo_bucket->addr, - bo_bucket->size, pdd->drm_priv, kgd_mem, - &offset, bo_bucket->alloc_flags, criu_resume); - if (ret) { - pr_err("Could not create the BO\n"); - return ret; - } - pr_debug("New BO created: size:0x%llx addr:0x%llx offset:0x%llx\n", - bo_bucket->size, bo_bucket->addr, offset); - - /* Restore previous IDR handle */ - pr_debug("Restoring old IDR handle for the BO"); - idr_handle = idr_alloc(&pdd->alloc_idr, *kgd_mem, bo_priv->idr_handle, - bo_priv->idr_handle + 1, GFP_KERNEL); - - if (idr_handle < 0) { - pr_err("Could not allocate idr\n"); - amdgpu_amdkfd_gpuvm_free_memory_of_gpu(pdd->dev->adev, *kgd_mem, pdd->drm_priv, - NULL); - return -ENOMEM; - } - - if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL) - bo_bucket->restored_offset = KFD_MMAP_TYPE_DOORBELL | KFD_MMAP_GPU_ID(pdd->dev->id); - if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP) { - bo_bucket->restored_offset = KFD_MMAP_TYPE_MMIO | KFD_MMAP_GPU_ID(pdd->dev->id); - } else if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_GTT) { - bo_bucket->restored_offset = offset; - } else if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_VRAM) { - bo_bucket->restored_offset = offset; - /* Update the VRAM usage count */ - WRITE_ONCE(pdd->vram_usage, pdd->vram_usage + bo_bucket->size); - } - return 0; -} - -static int criu_restore_bo(struct kfd_process *p, - struct kfd_criu_bo_bucket *bo_bucket, - struct kfd_criu_bo_priv_data *bo_priv) -{ - struct kfd_process_device *pdd; - struct kgd_mem *kgd_mem; - int ret; - int j; - - pr_debug("Restoring BO size:0x%llx addr:0x%llx gpu_id:0x%x flags:0x%x idr_handle:0x%x\n", - bo_bucket->size, bo_bucket->addr, bo_bucket->gpu_id, bo_bucket->alloc_flags, - bo_priv->idr_handle); - - pdd = kfd_process_device_data_by_id(p, bo_bucket->gpu_id); - if (!pdd) { - pr_err("Failed to get pdd\n"); - return -ENODEV; - } - - ret = criu_restore_memory_of_gpu(pdd, bo_bucket, bo_priv, &kgd_mem); - if 
(ret) - return ret; - - /* now map these BOs to GPU/s */ - for (j = 0; j < p->n_pdds; j++) { - struct kfd_node *peer; - struct kfd_process_device *peer_pdd; - - if (!bo_priv->mapped_gpuids[j]) - break; - - peer_pdd = kfd_process_device_data_by_id(p, bo_priv->mapped_gpuids[j]); - if (!peer_pdd) - return -EINVAL; - - peer = peer_pdd->dev; - - peer_pdd = kfd_bind_process_to_device(peer, p); - if (IS_ERR(peer_pdd)) - return PTR_ERR(peer_pdd); - - ret = amdgpu_amdkfd_gpuvm_map_memory_to_gpu(peer->adev, kgd_mem, - peer_pdd->drm_priv); - if (ret) { - pr_err("Failed to map to gpu %d/%d\n", j, p->n_pdds); - return ret; - } - } - - pr_debug("map memory was successful for the BO\n"); - /* create the dmabuf object and export the bo */ - if (bo_bucket->alloc_flags - & (KFD_IOC_ALLOC_MEM_FLAGS_VRAM | KFD_IOC_ALLOC_MEM_FLAGS_GTT)) { - ret = criu_get_prime_handle(kgd_mem, DRM_RDWR, - &bo_bucket->dmabuf_fd); - if (ret) - return ret; - } else { - bo_bucket->dmabuf_fd = KFD_INVALID_FD; - } - - return 0; -} - -static int criu_restore_bos(struct kfd_process *p, - struct kfd_ioctl_criu_args *args, - uint64_t *priv_offset, - uint64_t max_priv_data_size) -{ - struct kfd_criu_bo_bucket *bo_buckets = NULL; - struct kfd_criu_bo_priv_data *bo_privs = NULL; - int ret = 0; - uint32_t i = 0; - - if (*priv_offset + (args->num_bos * sizeof(*bo_privs)) > max_priv_data_size) - return -EINVAL; - - /* Prevent MMU notifications until stage-4 IOCTL (CRIU_RESUME) is received */ - amdgpu_amdkfd_block_mmu_notifications(p->kgd_process_info); - - bo_buckets = kvmalloc_array(args->num_bos, sizeof(*bo_buckets), GFP_KERNEL); - if (!bo_buckets) - return -ENOMEM; - - ret = copy_from_user(bo_buckets, (void __user *)args->bos, - args->num_bos * sizeof(*bo_buckets)); - if (ret) { - pr_err("Failed to copy BOs information from user\n"); - ret = -EFAULT; - goto exit; - } - - bo_privs = kvmalloc_array(args->num_bos, sizeof(*bo_privs), GFP_KERNEL); - if (!bo_privs) { - ret = -ENOMEM; - goto exit; - } - - ret = 
copy_from_user(bo_privs, (void __user *)args->priv_data + *priv_offset, - args->num_bos * sizeof(*bo_privs)); - if (ret) { - pr_err("Failed to copy BOs information from user\n"); - ret = -EFAULT; - goto exit; - } - *priv_offset += args->num_bos * sizeof(*bo_privs); - - /* Create and map new BOs */ - for (; i < args->num_bos; i++) { - ret = criu_restore_bo(p, &bo_buckets[i], &bo_privs[i]); - if (ret) { - pr_debug("Failed to restore BO[%d] ret%d\n", i, ret); - goto exit; - } - } /* done */ - - /* Copy only the buckets back so user can read bo_buckets[N].restored_offset */ - ret = copy_to_user((void __user *)args->bos, - bo_buckets, - (args->num_bos * sizeof(*bo_buckets))); - if (ret) - ret = -EFAULT; - -exit: - while (ret && i--) { - if (bo_buckets[i].alloc_flags - & (KFD_IOC_ALLOC_MEM_FLAGS_VRAM | KFD_IOC_ALLOC_MEM_FLAGS_GTT)) - close_fd(bo_buckets[i].dmabuf_fd); - } - kvfree(bo_buckets); - kvfree(bo_privs); - return ret; -} - -static int criu_restore_objects(struct file *filep, - struct kfd_process *p, - struct kfd_ioctl_criu_args *args, - uint64_t *priv_offset, - uint64_t max_priv_data_size) -{ - int ret = 0; - uint32_t i; - - BUILD_BUG_ON(offsetof(struct kfd_criu_queue_priv_data, object_type)); - BUILD_BUG_ON(offsetof(struct kfd_criu_event_priv_data, object_type)); - BUILD_BUG_ON(offsetof(struct kfd_criu_svm_range_priv_data, object_type)); - - for (i = 0; i < args->num_objects; i++) { - uint32_t object_type; - - if (*priv_offset + sizeof(object_type) > max_priv_data_size) { - pr_err("Invalid private data size\n"); - return -EINVAL; - } - - ret = get_user(object_type, (uint32_t __user *)(args->priv_data + *priv_offset)); - if (ret) { - pr_err("Failed to copy private information from user\n"); - goto exit; - } - - switch (object_type) { - case KFD_CRIU_OBJECT_TYPE_QUEUE: - ret = kfd_criu_restore_queue(p, (uint8_t __user *)args->priv_data, - priv_offset, max_priv_data_size); - if (ret) - goto exit; - break; - case KFD_CRIU_OBJECT_TYPE_EVENT: - ret = 
kfd_criu_restore_event(filep, p, (uint8_t __user *)args->priv_data, - priv_offset, max_priv_data_size); - if (ret) - goto exit; - break; - case KFD_CRIU_OBJECT_TYPE_SVM_RANGE: - ret = kfd_criu_restore_svm(p, (uint8_t __user *)args->priv_data, - priv_offset, max_priv_data_size); - if (ret) - goto exit; - break; - default: - pr_err("Invalid object type:%u at index:%d\n", object_type, i); - ret = -EINVAL; - goto exit; - } - } -exit: - return ret; -} - -static int criu_restore(struct file *filep, - struct kfd_process *p, - struct kfd_ioctl_criu_args *args) -{ - uint64_t priv_offset = 0; - int ret = 0; - - pr_debug("CRIU restore (num_devices:%u num_bos:%u num_objects:%u priv_data_size:%llu)\n", - args->num_devices, args->num_bos, args->num_objects, args->priv_data_size); - - if (!args->bos || !args->devices || !args->priv_data || !args->priv_data_size || - !args->num_devices || !args->num_bos) - return -EINVAL; - - mutex_lock(&p->mutex); - - /* - * Set the process to evicted state to avoid running any new queues before all the memory - * mappings are ready. 
- */ - ret = kfd_process_evict_queues(p, KFD_QUEUE_EVICTION_CRIU_RESTORE); - if (ret) - goto exit_unlock; - - /* Each function will adjust priv_offset based on how many bytes they consumed */ - ret = criu_restore_process(p, args, &priv_offset, args->priv_data_size); - if (ret) - goto exit_unlock; - - ret = criu_restore_devices(p, args, &priv_offset, args->priv_data_size); - if (ret) - goto exit_unlock; - - ret = criu_restore_bos(p, args, &priv_offset, args->priv_data_size); - if (ret) - goto exit_unlock; - - ret = criu_restore_objects(filep, p, args, &priv_offset, args->priv_data_size); - if (ret) - goto exit_unlock; - - if (priv_offset != args->priv_data_size) { - pr_err("Invalid private data size\n"); - ret = -EINVAL; - } - -exit_unlock: - mutex_unlock(&p->mutex); - if (ret) - pr_err("Failed to restore CRIU ret:%d\n", ret); - else - pr_debug("CRIU restore successful\n"); - - return ret; -} - -static int criu_unpause(struct file *filep, - struct kfd_process *p, - struct kfd_ioctl_criu_args *args) -{ - int ret; - - mutex_lock(&p->mutex); - - if (!p->queues_paused) { - mutex_unlock(&p->mutex); - return -EINVAL; - } - - ret = kfd_process_restore_queues(p); - if (ret) - pr_err("Failed to unpause queues ret:%d\n", ret); - else - p->queues_paused = false; - - mutex_unlock(&p->mutex); - - return ret; -} - -static int criu_resume(struct file *filep, - struct kfd_process *p, - struct kfd_ioctl_criu_args *args) -{ - struct kfd_process *target = NULL; - struct pid *pid = NULL; - int ret = 0; - - pr_debug("Inside %s, target pid for criu restore: %d\n", __func__, - args->pid); - - pid = find_get_pid(args->pid); - if (!pid) { - pr_err("Cannot find pid info for %i\n", args->pid); - return -ESRCH; - } - - pr_debug("calling kfd_lookup_process_by_pid\n"); - target = kfd_lookup_process_by_pid(pid); - - put_pid(pid); - - if (!target) { - pr_debug("Cannot find process info for %i\n", args->pid); - return -ESRCH; - } - - mutex_lock(&target->mutex); - ret = kfd_criu_resume_svm(target); 
- if (ret) { - pr_err("kfd_criu_resume_svm failed for %i\n", args->pid); - goto exit; - } - - ret = amdgpu_amdkfd_criu_resume(target->kgd_process_info); - if (ret) - pr_err("amdgpu_amdkfd_criu_resume failed for %i\n", args->pid); - -exit: - mutex_unlock(&target->mutex); - - kfd_unref_process(target); - return ret; -} - -static int criu_process_info(struct file *filep, - struct kfd_process *p, - struct kfd_ioctl_criu_args *args) -{ - int ret = 0; - - mutex_lock(&p->mutex); - - if (!p->n_pdds) { - pr_err("No pdd for given process\n"); - ret = -ENODEV; - goto err_unlock; - } - - ret = kfd_process_evict_queues(p, KFD_QUEUE_EVICTION_CRIU_CHECKPOINT); - if (ret) - goto err_unlock; - - p->queues_paused = true; - - args->pid = task_pid_nr_ns(p->lead_thread, - task_active_pid_ns(p->lead_thread)); - - ret = criu_get_process_object_info(p, &args->num_devices, &args->num_bos, - &args->num_objects, &args->priv_data_size); - if (ret) - goto err_unlock; - - dev_dbg(kfd_device, "Num of devices:%u bos:%u objects:%u priv_data_size:%lld\n", - args->num_devices, args->num_bos, args->num_objects, - args->priv_data_size); - -err_unlock: - if (ret) { - kfd_process_restore_queues(p); - p->queues_paused = false; - } - mutex_unlock(&p->mutex); - return ret; -} - static int kfd_ioctl_criu(struct file *filep, struct kfd_process *p, void *data) { struct kfd_ioctl_criu_args *args = data; @@ -2720,19 +1760,19 @@ static int kfd_ioctl_criu(struct file *filep, struct kfd_process *p, void *data) dev_dbg(kfd_device, "CRIU operation: %d\n", args->op); switch (args->op) { case KFD_CRIU_OP_PROCESS_INFO: - ret = criu_process_info(filep, p, args); + ret = kfd_criu_process_info(filep, p, args); break; case KFD_CRIU_OP_CHECKPOINT: - ret = criu_checkpoint(filep, p, args); + ret = kfd_criu_checkpoint(filep, p, args); break; case KFD_CRIU_OP_UNPAUSE: - ret = criu_unpause(filep, p, args); + ret = kfd_criu_unpause(filep, p, args); break; case KFD_CRIU_OP_RESTORE: - ret = criu_restore(filep, p, args); + ret = 
kfd_criu_restore(filep, p, args); break; case KFD_CRIU_OP_RESUME: - ret = criu_resume(filep, p, args); + ret = kfd_criu_resume(filep, p, args); break; default: dev_dbg(kfd_device, "Unsupported CRIU operation:%d\n", args->op); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_criu.c b/drivers/gpu/drm/amd/amdkfd/kfd_criu.c new file mode 100644 index 000000000000..72a9b358a642 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_criu.c @@ -0,0 +1,989 @@ +// SPDX-License-Identifier: GPL-2.0 OR MIT +/* + * Copyright 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#include <linux/dma-buf.h> +#include <linux/fdtable.h> + +#include "kfd_criu.h" +#include "kfd_svm.h" + +static int criu_checkpoint_process(struct kfd_process *p, + uint8_t __user *user_priv_data, + uint64_t *priv_offset) +{ + struct kfd_criu_process_priv_data process_priv; + int ret; + + memset(&process_priv, 0, sizeof(process_priv)); + + process_priv.version = KFD_CRIU_PRIV_VERSION; + /* For CR, we don't consider negative xnack mode which is used for + * querying without changing it, here 0 simply means disabled and 1 + * means enabled so retry for finding a valid PTE. + */ + process_priv.xnack_mode = p->xnack_enabled ? 1 : 0; + + ret = copy_to_user(user_priv_data + *priv_offset, + &process_priv, sizeof(process_priv)); + + if (ret) { + pr_err("Failed to copy process information to user\n"); + ret = -EFAULT; + } + + *priv_offset += sizeof(process_priv); + return ret; +} + +static int criu_checkpoint_devices(struct kfd_process *p, + uint32_t num_devices, + uint8_t __user *user_addr, + uint8_t __user *user_priv_data, + uint64_t *priv_offset) +{ + struct kfd_criu_device_priv_data *device_priv = NULL; + struct kfd_criu_device_bucket *device_buckets = NULL; + int ret = 0, i; + + device_buckets = kvzalloc(num_devices * sizeof(*device_buckets), GFP_KERNEL); + if (!device_buckets) { + ret = -ENOMEM; + goto exit; + } + + device_priv = kvzalloc(num_devices * sizeof(*device_priv), GFP_KERNEL); + if (!device_priv) { + ret = -ENOMEM; + goto exit; + } + + for (i = 0; i < num_devices; i++) { + struct kfd_process_device *pdd = p->pdds[i]; + + device_buckets[i].user_gpu_id = pdd->user_gpu_id; + device_buckets[i].actual_gpu_id = pdd->dev->id; + + /* + * priv_data does not contain useful information for now and is reserved for + * future use, so we do not set its contents. 
+		 */
+	}
+
+	ret = copy_to_user(user_addr, device_buckets, num_devices * sizeof(*device_buckets));
+	if (ret) {
+		pr_err("Failed to copy device information to user\n");
+		ret = -EFAULT;
+		goto exit;
+	}
+
+	ret = copy_to_user(user_priv_data + *priv_offset,
+			   device_priv,
+			   num_devices * sizeof(*device_priv));
+	if (ret) {
+		pr_err("Failed to copy device information to user\n");
+		ret = -EFAULT;
+	}
+	*priv_offset += num_devices * sizeof(*device_priv);
+
+exit:
+	kvfree(device_buckets);
+	kvfree(device_priv);
+	return ret;
+}
+
+/*
+ * Count the BOs that criu_checkpoint_bos() will dump: every entry of every
+ * PDD's alloc_idr that has no VA, or whose VA is above the PDD's gpuvm_base.
+ * The condition is the exact complement of the "skip" test used there.
+ */
+static uint32_t get_process_num_bos(struct kfd_process *p)
+{
+	uint32_t num_of_bos = 0;
+	int i;
+
+	/* Run over all PDDs of the process */
+	for (i = 0; i < p->n_pdds; i++) {
+		struct kfd_process_device *pdd = p->pdds[i];
+		void *mem;
+		int id;
+
+		idr_for_each_entry(&pdd->alloc_idr, mem, id) {
+			struct kgd_mem *kgd_mem = (struct kgd_mem *)mem;
+
+			if (!kgd_mem->va || kgd_mem->va > pdd->gpuvm_base)
+				num_of_bos++;
+		}
+	}
+	return num_of_bos;
+}
+
+/*
+ * Export @mem as a dma-buf and create a file descriptor for it using @flags.
+ *
+ * Returns 0 and stores the new fd in *shared_fd on success. If the export
+ * fails its error is returned; if no fd can be created, the dma-buf reference
+ * is dropped and the negative value from dma_buf_fd() is returned.
+ */
+static int criu_get_prime_handle(struct kgd_mem *mem,
+				 int flags, u32 *shared_fd)
+{
+	struct dma_buf *dmabuf;
+	int ret;
+
+	ret = amdgpu_amdkfd_gpuvm_export_dmabuf(mem, &dmabuf);
+	if (ret) {
+		pr_err("dmabuf export failed for the BO\n");
+		return ret;
+	}
+
+	ret = dma_buf_fd(dmabuf, flags);
+	if (ret < 0) {
+		pr_err("dmabuf create fd failed, ret:%d\n", ret);
+		goto out_free_dmabuf;
+	}
+
+	*shared_fd = ret;
+	return 0;
+
+out_free_dmabuf:
+	dma_buf_put(dmabuf);
+	return ret;
+}
+
+/*
+ * Dump the process BOs: fill one bucket and one private-data record per BO,
+ * then copy the bucket array to @user_bos and the private records to
+ * @user_priv_data + *priv_offset.
+ *
+ * @num_bos sizes both temporary arrays; the caller obtains it from
+ * criu_get_process_object_info(). *priv_offset is advanced by the size of the
+ * private records only when both copies succeed.
+ *
+ * On any failure the dmabuf fds already created for earlier BOs are closed.
+ * Returns 0 on success or a negative errno.
+ */
+static int criu_checkpoint_bos(struct kfd_process *p,
+			       uint32_t num_bos,
+			       uint8_t __user *user_bos,
+			       uint8_t __user *user_priv_data,
+			       uint64_t *priv_offset)
+{
+	struct kfd_criu_bo_bucket *bo_buckets;
+	struct kfd_criu_bo_priv_data *bo_privs;
+	int ret = 0, pdd_index, bo_index = 0, id;
+	void *mem;
+
+	bo_buckets = kvzalloc(num_bos * sizeof(*bo_buckets), GFP_KERNEL);
+	if (!bo_buckets)
+		return -ENOMEM;
+
+	bo_privs = kvzalloc(num_bos * sizeof(*bo_privs), GFP_KERNEL);
+	if (!bo_privs) {
+		ret = -ENOMEM;
+		goto exit;
+	}
+
+	for (pdd_index = 0; pdd_index < p->n_pdds; pdd_index++) {
+		struct kfd_process_device *pdd = p->pdds[pdd_index];
+		struct amdgpu_bo *dumper_bo;
+		struct kgd_mem *kgd_mem;
+
+		idr_for_each_entry(&pdd->alloc_idr, mem, id) {
+			struct kfd_criu_bo_bucket *bo_bucket;
+			struct kfd_criu_bo_priv_data *bo_priv;
+			int i, dev_idx = 0;
+
+			if (!mem) {
+				ret = -ENOMEM;
+				goto exit;
+			}
+
+			kgd_mem = (struct kgd_mem *)mem;
+			dumper_bo = kgd_mem->bo;
+
+			/* Skip checkpointing BOs that are used for Trap handler
+			 * code and state. Currently, these BOs have a non-zero VA
+			 * that is less than or equal to the GPUVM base.
+			 */
+			if (kgd_mem->va && kgd_mem->va <= pdd->gpuvm_base)
+				continue;
+
+			bo_bucket = &bo_buckets[bo_index];
+			bo_priv = &bo_privs[bo_index];
+
+			bo_bucket->gpu_id = pdd->user_gpu_id;
+			bo_bucket->addr = (uint64_t)kgd_mem->va;
+			bo_bucket->size = amdgpu_bo_size(dumper_bo);
+			bo_bucket->alloc_flags = (uint32_t)kgd_mem->alloc_flags;
+			bo_priv->idr_handle = id;
+
+			if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_USERPTR) {
+				ret = amdgpu_ttm_tt_get_userptr(&dumper_bo->tbo,
+								&bo_priv->user_addr);
+				if (ret) {
+					pr_err("Failed to obtain user address for user-pointer bo\n");
+					goto exit;
+				}
+			}
+			/* Only VRAM and GTT BOs get a dmabuf fd; it is read-write only for writable BOs */
+			if (bo_bucket->alloc_flags
+			    & (KFD_IOC_ALLOC_MEM_FLAGS_VRAM | KFD_IOC_ALLOC_MEM_FLAGS_GTT)) {
+				ret = criu_get_prime_handle(kgd_mem,
+						bo_bucket->alloc_flags &
+						KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE ? DRM_RDWR : 0,
+						&bo_bucket->dmabuf_fd);
+				if (ret)
+					goto exit;
+			} else {
+				bo_bucket->dmabuf_fd = KFD_INVALID_FD;
+			}
+
+			/* Doorbell and MMIO BOs report a KFD mmap type, all others the BO mmap offset */
+			if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL)
+				bo_bucket->offset = KFD_MMAP_TYPE_DOORBELL |
+					KFD_MMAP_GPU_ID(pdd->dev->id);
+			else if (bo_bucket->alloc_flags &
+				KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP)
+				bo_bucket->offset = KFD_MMAP_TYPE_MMIO |
+					KFD_MMAP_GPU_ID(pdd->dev->id);
+			else
+				bo_bucket->offset = amdgpu_bo_mmap_offset(dumper_bo);
+
+			/* Record the user gpu_id of every device this BO is mapped to */
+			for (i = 0; i < p->n_pdds; i++) {
+				if (amdgpu_amdkfd_bo_mapped_to_dev(p->pdds[i]->dev->adev, kgd_mem))
+					bo_priv->mapped_gpuids[dev_idx++] = p->pdds[i]->user_gpu_id;
+			}
+
+			pr_debug("bo_size = 0x%llx, bo_addr = 0x%llx bo_offset = 0x%llx\n"
+					"gpu_id = 0x%x alloc_flags = 0x%x idr_handle = 0x%x",
+					bo_bucket->size,
+					bo_bucket->addr,
+					bo_bucket->offset,
+					bo_bucket->gpu_id,
+					bo_bucket->alloc_flags,
+					bo_priv->idr_handle);
+			bo_index++;
+		}
+	}
+
+	ret = copy_to_user(user_bos, bo_buckets, num_bos * sizeof(*bo_buckets));
+	if (ret) {
+		pr_err("Failed to copy BO information to user\n");
+		ret = -EFAULT;
+		goto exit;
+	}
+
+	ret = copy_to_user(user_priv_data + *priv_offset, bo_privs, num_bos * sizeof(*bo_privs));
+	if (ret) {
+		pr_err("Failed to copy BO priv information to user\n");
+		ret = -EFAULT;
+		goto exit;
+	}
+
+	*priv_offset += num_bos * sizeof(*bo_privs);
+
+exit:
+	/*
+	 * Error unwind: bo_index counts the buckets that were completely filled,
+	 * so walk them backwards and close the dmabuf fds that were created.
+	 */
+	while (ret && bo_index--) {
+		if (bo_buckets[bo_index].alloc_flags
+		    & (KFD_IOC_ALLOC_MEM_FLAGS_VRAM | KFD_IOC_ALLOC_MEM_FLAGS_GTT))
+			close_fd(bo_buckets[bo_index].dmabuf_fd);
+	}
+
+	kvfree(bo_buckets);
+	kvfree(bo_privs);
+	return ret;
+}
+
+/*
+ * Report how many devices, BOs and other objects (queues + events + SVM
+ * ranges) the process has. When @objs_priv_size is non-NULL it also receives
+ * the total private-data size needed to checkpoint all of them.
+ */
+static int criu_get_process_object_info(struct kfd_process *p,
+					uint32_t *num_devices,
+					uint32_t *num_bos,
+					uint32_t *num_objects,
+					uint64_t *objs_priv_size)
+{
+	uint64_t queues_priv_data_size, svm_priv_data_size, priv_size;
+	uint32_t num_queues, num_events, num_svm_ranges;
+	int ret;
+
+	*num_devices = p->n_pdds;
+	*num_bos = get_process_num_bos(p);
+
+	ret =
kfd_process_get_queue_info(p, &num_queues, &queues_priv_data_size);
+	if (ret)
+		return ret;
+
+	num_events = kfd_get_num_events(p);
+
+	ret = svm_range_get_info(p, &num_svm_ranges, &svm_priv_data_size);
+	if (ret)
+		return ret;
+
+	*num_objects = num_queues + num_events + num_svm_ranges;
+
+	/* The private-data size is optional; it is the sum of every section's size */
+	if (objs_priv_size) {
+		priv_size = sizeof(struct kfd_criu_process_priv_data);
+		priv_size += *num_devices * sizeof(struct kfd_criu_device_priv_data);
+		priv_size += *num_bos * sizeof(struct kfd_criu_bo_priv_data);
+		priv_size += queues_priv_data_size;
+		priv_size += num_events * sizeof(struct kfd_criu_event_priv_data);
+		priv_size += svm_priv_data_size;
+		*objs_priv_size = priv_size;
+	}
+	return 0;
+}
+
+/*
+ * Checkpoint the process into the user buffers described by @args.
+ *
+ * Requires non-NULL devices, bos and priv_data pointers, at least one PDD and
+ * paused queues (p->queues_paused). The counts and private-data size in @args
+ * must match what criu_get_process_object_info() reports now, otherwise
+ * -EINVAL is returned.
+ *
+ * Private data is written in the order process, devices, BOs, queues, events,
+ * SVM ranges. The BO section is reserved first but written last. Runs with
+ * p->mutex held. Returns 0 on success or a negative errno.
+ */
+int kfd_criu_checkpoint(struct file *filep,
+			struct kfd_process *p,
+			struct kfd_ioctl_criu_args *args)
+{
+	int ret;
+	uint32_t num_devices, num_bos, num_objects;
+	uint64_t priv_size, priv_offset = 0, bo_priv_offset;
+
+	if (!args->devices || !args->bos || !args->priv_data)
+		return -EINVAL;
+
+	mutex_lock(&p->mutex);
+
+	if (!p->n_pdds) {
+		pr_err("No pdd for given process\n");
+		ret = -ENODEV;
+		goto exit_unlock;
+	}
+
+	/* Confirm all process queues are evicted */
+	if (!p->queues_paused) {
+		pr_err("Cannot dump process when queues are not in evicted state\n");
+		/* CRIU plugin did not call op PROCESS_INFO before checkpointing */
+		ret = -EINVAL;
+		goto exit_unlock;
+	}
+
+	ret = criu_get_process_object_info(p, &num_devices, &num_bos, &num_objects, &priv_size);
+	if (ret)
+		goto exit_unlock;
+
+	/* Reject the request if the process changed since user space sized its buffers */
+	if (num_devices != args->num_devices ||
+	    num_bos != args->num_bos ||
+	    num_objects != args->num_objects ||
+	    priv_size != args->priv_data_size) {
+
+		ret = -EINVAL;
+		goto exit_unlock;
+	}
+
+	/* each function will store private data inside priv_data and adjust priv_offset */
+	ret = criu_checkpoint_process(p, (uint8_t __user *)args->priv_data, &priv_offset);
+	if (ret)
+		goto exit_unlock;
+
+	ret = criu_checkpoint_devices(p, num_devices, (uint8_t __user *)args->devices,
+				(uint8_t __user *)args->priv_data, &priv_offset);
+	if (ret)
+		goto exit_unlock;
+
+	/* Leave room for BOs in the private data. They need to be restored
+	 * before events, but we checkpoint them last to simplify the error
+	 * handling.
+	 */
+	bo_priv_offset = priv_offset;
+	priv_offset += num_bos * sizeof(struct kfd_criu_bo_priv_data);
+
+	if (num_objects) {
+		ret = kfd_criu_checkpoint_queues(p, (uint8_t __user *)args->priv_data,
+						 &priv_offset);
+		if (ret)
+			goto exit_unlock;
+
+		ret = kfd_criu_checkpoint_events(p, (uint8_t __user *)args->priv_data,
+						 &priv_offset);
+		if (ret)
+			goto exit_unlock;
+
+		ret = kfd_criu_checkpoint_svm(p, (uint8_t __user *)args->priv_data, &priv_offset);
+		if (ret)
+			goto exit_unlock;
+	}
+
+	/* This must be the last thing in this function that can fail.
+	 * Otherwise we leak dmabuf file descriptors.
+	 */
+	ret = criu_checkpoint_bos(p, num_bos, (uint8_t __user *)args->bos,
+			   (uint8_t __user *)args->priv_data, &bo_priv_offset);
+
+exit_unlock:
+	mutex_unlock(&p->mutex);
+	if (ret)
+		pr_err("Failed to dump CRIU ret:%d\n", ret);
+	else
+		pr_debug("CRIU dump ret:%d\n", ret);
+
+	return ret;
+}
+
+/*
+ * Read the process private data at args->priv_data + *priv_offset, check that
+ * its version equals KFD_CRIU_PRIV_VERSION and apply the saved XNACK mode.
+ * *priv_offset is advanced past the record once it has been copied in.
+ *
+ * Returns -EINVAL if the record does not fit in @max_priv_data_size or the
+ * version differs, -EFAULT if the copy fails, -EPERM if XNACK was enabled at
+ * checkpoint time but kfd_process_xnack_mode() rejects enabling it now.
+ */
+static int criu_restore_process(struct kfd_process *p,
+				struct kfd_ioctl_criu_args *args,
+				uint64_t *priv_offset,
+				uint64_t max_priv_data_size)
+{
+	int ret = 0;
+	struct kfd_criu_process_priv_data process_priv;
+
+	if (*priv_offset + sizeof(process_priv) > max_priv_data_size)
+		return -EINVAL;
+
+	ret = copy_from_user(&process_priv,
+				(void __user *)(args->priv_data + *priv_offset),
+				sizeof(process_priv));
+	if (ret) {
+		pr_err("Failed to copy process private information from user\n");
+		ret = -EFAULT;
+		goto exit;
+	}
+	*priv_offset += sizeof(process_priv);
+
+	if (process_priv.version != KFD_CRIU_PRIV_VERSION) {
+		pr_err("Invalid CRIU API version (checkpointed:%d current:%d)\n",
+			process_priv.version, KFD_CRIU_PRIV_VERSION);
+		return -EINVAL;
+	}
+
+	pr_debug("Setting XNACK mode\n");
+	if (process_priv.xnack_mode &&
!kfd_process_xnack_mode(p, true)) {
+		pr_err("xnack mode cannot be set\n");
+		ret = -EPERM;
+		goto exit;
+	} else {
+		pr_debug("set xnack mode: %d\n", process_priv.xnack_mode);
+		p->xnack_enabled = process_priv.xnack_mode;
+	}
+
+exit:
+	return ret;
+}
+
+/*
+ * Restore the per-device state from the device buckets in args->devices.
+ *
+ * For every bucket: look up the device by actual_gpu_id, record the
+ * checkpointed user_gpu_id in its PDD, initialise the PDD's VM from the
+ * render-node fd supplied in drm_fd, bind the process to the device and
+ * allocate process doorbells if the PDD has none yet.
+ *
+ * The device private data is not read, but *priv_offset is still advanced
+ * past it on success.
+ *
+ * Returns -EINVAL if num_devices differs from p->n_pdds, if the private data
+ * does not fit in @max_priv_data_size, or if a bucket is invalid; -ENOMEM or
+ * -EFAULT if the buckets cannot be allocated or copied in; otherwise the
+ * error of the failing helper.
+ */
+static int criu_restore_devices(struct kfd_process *p,
+				struct kfd_ioctl_criu_args *args,
+				uint64_t *priv_offset,
+				uint64_t max_priv_data_size)
+{
+	struct kfd_criu_device_bucket *device_buckets;
+	struct kfd_criu_device_priv_data *device_privs;
+	int ret = 0;
+	uint32_t i;
+
+	if (args->num_devices != p->n_pdds)
+		return -EINVAL;
+
+	if (*priv_offset + (args->num_devices * sizeof(*device_privs)) > max_priv_data_size)
+		return -EINVAL;
+
+	device_buckets = kmalloc_array(args->num_devices, sizeof(*device_buckets), GFP_KERNEL);
+	if (!device_buckets)
+		return -ENOMEM;
+
+	ret = copy_from_user(device_buckets, (void __user *)args->devices,
+				args->num_devices * sizeof(*device_buckets));
+	if (ret) {
+		pr_err("Failed to copy devices buckets from user\n");
+		ret = -EFAULT;
+		goto exit;
+	}
+
+	for (i = 0; i < args->num_devices; i++) {
+		struct kfd_node *dev;
+		struct kfd_process_device *pdd;
+		struct file *drm_file;
+
+		/* device private data is not currently used */
+
+		if (!device_buckets[i].user_gpu_id) {
+			pr_err("Invalid user gpu_id\n");
+			ret = -EINVAL;
+			goto exit;
+		}
+
+		dev = kfd_device_by_id(device_buckets[i].actual_gpu_id);
+		if (!dev) {
+			pr_err("Failed to find device with gpu_id = %x\n",
+				device_buckets[i].actual_gpu_id);
+			ret = -EINVAL;
+			goto exit;
+		}
+
+		pdd = kfd_get_process_device_data(dev, p);
+		if (!pdd) {
+			pr_err("Failed to get pdd for gpu_id = %x\n",
+				device_buckets[i].actual_gpu_id);
+			ret = -EINVAL;
+			goto exit;
+		}
+		pdd->user_gpu_id = device_buckets[i].user_gpu_id;
+
+		/* fget() takes a file reference that every error path below must drop */
+		drm_file = fget(device_buckets[i].drm_fd);
+		if (!drm_file) {
+			pr_err("Invalid render node file descriptor sent from plugin (%d)\n",
+				device_buckets[i].drm_fd);
+			ret = -EINVAL;
+			goto exit;
+		}
+
+		if (pdd->drm_file) {
+			/* PDD already has a render node: drop our reference instead of leaking it */
+			fput(drm_file);
+			ret = -EINVAL;
+			goto exit;
+		}
+
+		/* create the vm using render nodes for kfd pdd */
+		if (kfd_process_device_init_vm(pdd, drm_file)) {
+			pr_err("could not init vm for given pdd\n");
+			/* Only on success does the PDD keep the drm_file reference */
+			fput(drm_file);
+			ret = -EINVAL;
+			goto exit;
+		}
+		/*
+		 * pdd now already has the vm bound to render node so below api won't create a new
+		 * exclusive kfd mapping but use existing one with renderDXXX but is still needed
+		 * for iommu v2 binding and runtime pm.
+		 */
+		pdd = kfd_bind_process_to_device(dev, p);
+		if (IS_ERR(pdd)) {
+			ret = PTR_ERR(pdd);
+			goto exit;
+		}
+
+		if (!pdd->qpd.proc_doorbells) {
+			ret = kfd_alloc_process_doorbells(dev->kfd, pdd);
+			if (ret)
+				goto exit;
+		}
+	}
+
+	/*
+	 * We are not copying device private data from user as we are not using the data for now,
+	 * but we still adjust for its private data.
+	 */
+	*priv_offset += args->num_devices * sizeof(*device_privs);
+
+exit:
+	kfree(device_buckets);
+	return ret;
+}
+
+/*
+ * Recreate one BO on @pdd from its checkpointed bucket and private data, give
+ * it back its old IDR handle and fill in bo_bucket->restored_offset.
+ * The new BO is returned through @kgd_mem.
+ */
+static int criu_restore_memory_of_gpu(struct kfd_process_device *pdd,
+				      struct kfd_criu_bo_bucket *bo_bucket,
+				      struct kfd_criu_bo_priv_data *bo_priv,
+				      struct kgd_mem **kgd_mem)
+{
+	int idr_handle;
+	int ret;
+	const bool criu_resume = true;
+	u64 offset;
+
+	if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL) {
+		if (bo_bucket->size !=
+				kfd_doorbell_process_slice(pdd->dev->kfd))
+			return -EINVAL;
+
+		offset = kfd_get_process_doorbells(pdd);
+		if (!offset)
+			return -ENOMEM;
+	} else if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP) {
+		/* MMIO BOs need remapped bus address */
+		if (bo_bucket->size != PAGE_SIZE) {
+			pr_err("Invalid page size\n");
+			return -EINVAL;
+		}
+		offset = pdd->dev->adev->rmmio_remap.bus_addr;
+		if (!offset) {
+			pr_err("amdgpu_amdkfd_get_mmio_remap_phys_addr failed\n");
+			return -ENOMEM;
+		}
+	} else if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_USERPTR) {
+		offset = bo_priv->user_addr;
+	}
+	/* Create the BO */
+	ret =
amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(pdd->dev->adev, bo_bucket->addr,
+						      bo_bucket->size, pdd->drm_priv, kgd_mem,
+						      &offset, bo_bucket->alloc_flags, criu_resume);
+	if (ret) {
+		pr_err("Could not create the BO\n");
+		return ret;
+	}
+	pr_debug("New BO created: size:0x%llx addr:0x%llx offset:0x%llx\n",
+		 bo_bucket->size, bo_bucket->addr, offset);
+
+	/* Restore previous IDR handle */
+	pr_debug("Restoring old IDR handle for the BO");
+	/* The [handle, handle + 1) range makes idr_alloc() hand out exactly the old handle */
+	idr_handle = idr_alloc(&pdd->alloc_idr, *kgd_mem, bo_priv->idr_handle,
+			       bo_priv->idr_handle + 1, GFP_KERNEL);
+
+	if (idr_handle < 0) {
+		pr_err("Could not allocate idr\n");
+		amdgpu_amdkfd_gpuvm_free_memory_of_gpu(pdd->dev->adev, *kgd_mem, pdd->drm_priv,
+						       NULL);
+		return -ENOMEM;
+	}
+
+	/*
+	 * NOTE(review): the DOORBELL test is a separate `if`, not part of the
+	 * else-if chain below, so a bucket that also carried GTT or VRAM would
+	 * have restored_offset overwritten - confirm these flags are exclusive.
+	 */
+	if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL)
+		bo_bucket->restored_offset = KFD_MMAP_TYPE_DOORBELL | KFD_MMAP_GPU_ID(pdd->dev->id);
+	if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP) {
+		bo_bucket->restored_offset = KFD_MMAP_TYPE_MMIO | KFD_MMAP_GPU_ID(pdd->dev->id);
+	} else if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_GTT) {
+		bo_bucket->restored_offset = offset;
+	} else if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_VRAM) {
+		bo_bucket->restored_offset = offset;
+		/* Update the VRAM usage count */
+		WRITE_ONCE(pdd->vram_usage, pdd->vram_usage + bo_bucket->size);
+	}
+	return 0;
+}
+
+/*
+ * Restore a single BO: find the PDD for bo_bucket->gpu_id, recreate the BO
+ * with criu_restore_memory_of_gpu(), map it on every GPU listed in
+ * bo_priv->mapped_gpuids and, for VRAM/GTT BOs, export a read-write dmabuf
+ * fd into bo_bucket->dmabuf_fd (KFD_INVALID_FD for all other BOs).
+ *
+ * Returns 0 on success, -ENODEV if no PDD matches gpu_id, -EINVAL if a
+ * mapped gpu id has no PDD, or the error of the failing helper.
+ */
+static int criu_restore_bo(struct kfd_process *p,
+			   struct kfd_criu_bo_bucket *bo_bucket,
+			   struct kfd_criu_bo_priv_data *bo_priv)
+{
+	struct kfd_process_device *pdd;
+	struct kgd_mem *kgd_mem;
+	int ret;
+	int j;
+
+	pr_debug("Restoring BO size:0x%llx addr:0x%llx gpu_id:0x%x flags:0x%x idr_handle:0x%x\n",
+		  bo_bucket->size, bo_bucket->addr, bo_bucket->gpu_id, bo_bucket->alloc_flags,
+		  bo_priv->idr_handle);
+
+	pdd = kfd_process_device_data_by_id(p, bo_bucket->gpu_id);
+	if (!pdd) {
+		pr_err("Failed to get pdd\n");
+		return -ENODEV;
+	}
+
+	ret = criu_restore_memory_of_gpu(pdd, bo_bucket, bo_priv, &kgd_mem);
+	if (ret)
+		return ret;
+
+	/* now map these BOs to GPU/s */
+	for (j = 0; j < p->n_pdds; j++) {
+		struct kfd_node *peer;
+		struct kfd_process_device *peer_pdd;
+
+		/* A zero gpu id ends the list of mapped devices */
+		if (!bo_priv->mapped_gpuids[j])
+			break;
+
+		peer_pdd = kfd_process_device_data_by_id(p, bo_priv->mapped_gpuids[j]);
+		if (!peer_pdd)
+			return -EINVAL;
+
+		peer = peer_pdd->dev;
+
+		peer_pdd = kfd_bind_process_to_device(peer, p);
+		if (IS_ERR(peer_pdd))
+			return PTR_ERR(peer_pdd);
+
+		ret = amdgpu_amdkfd_gpuvm_map_memory_to_gpu(peer->adev, kgd_mem,
+							    peer_pdd->drm_priv);
+		if (ret) {
+			pr_err("Failed to map to gpu %d/%d\n", j, p->n_pdds);
+			return ret;
+		}
+	}
+
+	pr_debug("map memory was successful for the BO\n");
+	/* create the dmabuf object and export the bo */
+	if (bo_bucket->alloc_flags
+	    & (KFD_IOC_ALLOC_MEM_FLAGS_VRAM | KFD_IOC_ALLOC_MEM_FLAGS_GTT)) {
+		ret = criu_get_prime_handle(kgd_mem, DRM_RDWR,
+					    &bo_bucket->dmabuf_fd);
+		if (ret)
+			return ret;
+	} else {
+		bo_bucket->dmabuf_fd = KFD_INVALID_FD;
+	}
+
+	return 0;
+}
+
+/*
+ * Restore all BOs: copy the buckets from args->bos and the private records
+ * from args->priv_data + *priv_offset, restore each BO with
+ * criu_restore_bo(), then copy the updated buckets back to user space.
+ *
+ * MMU notifications are blocked before anything is allocated. On failure the
+ * dmabuf fds of the BOs restored so far are closed.
+ */
+static int criu_restore_bos(struct kfd_process *p,
+			    struct kfd_ioctl_criu_args *args,
+			    uint64_t *priv_offset,
+			    uint64_t max_priv_data_size)
+{
+	struct kfd_criu_bo_bucket *bo_buckets = NULL;
+	struct kfd_criu_bo_priv_data *bo_privs = NULL;
+	int ret = 0;
+	uint32_t i = 0;
+
+	if (*priv_offset + (args->num_bos * sizeof(*bo_privs)) > max_priv_data_size)
+		return -EINVAL;
+
+	/* Prevent MMU notifications until stage-4 IOCTL (CRIU_RESUME) is received */
+	amdgpu_amdkfd_block_mmu_notifications(p->kgd_process_info);
+
+	bo_buckets = kvmalloc_array(args->num_bos, sizeof(*bo_buckets), GFP_KERNEL);
+	if (!bo_buckets)
+		return -ENOMEM;
+
+	ret = copy_from_user(bo_buckets, (void __user *)args->bos,
+			     args->num_bos * sizeof(*bo_buckets));
+	if (ret) {
+		pr_err("Failed to copy BOs information from user\n");
+		ret = -EFAULT;
+		goto exit;
+	}
+
+	bo_privs = kvmalloc_array(args->num_bos, sizeof(*bo_privs), GFP_KERNEL);
+	if (!bo_privs) {
+		ret = -ENOMEM;
+		goto exit;
+	}
+
+	ret =
copy_from_user(bo_privs, (void __user *)args->priv_data + *priv_offset,
+			     args->num_bos * sizeof(*bo_privs));
+	if (ret) {
+		pr_err("Failed to copy BOs information from user\n");
+		ret = -EFAULT;
+		goto exit;
+	}
+	*priv_offset += args->num_bos * sizeof(*bo_privs);
+
+	/* Create and map new BOs */
+	for (; i < args->num_bos; i++) {
+		ret = criu_restore_bo(p, &bo_buckets[i], &bo_privs[i]);
+		if (ret) {
+			pr_debug("Failed to restore BO[%d] ret%d\n", i, ret);
+			goto exit;
+		}
+	} /* done */
+
+	/* Copy only the buckets back so user can read bo_buckets[N].restored_offset */
+	ret = copy_to_user((void __user *)args->bos,
+				bo_buckets,
+				(args->num_bos * sizeof(*bo_buckets)));
+	if (ret)
+		ret = -EFAULT;
+
+exit:
+	/*
+	 * Error unwind: i is the number of BOs restored so far (0 if the loop
+	 * never ran), so close the dmabuf fds of buckets [0, i).
+	 */
+	while (ret && i--) {
+		if (bo_buckets[i].alloc_flags
+		   & (KFD_IOC_ALLOC_MEM_FLAGS_VRAM | KFD_IOC_ALLOC_MEM_FLAGS_GTT))
+			close_fd(bo_buckets[i].dmabuf_fd);
+	}
+	kvfree(bo_buckets);
+	kvfree(bo_privs);
+	return ret;
+}
+
+/*
+ * Restore the args->num_objects objects that follow the BO records in the
+ * private data. Each object starts with a 32-bit object_type, which selects
+ * the queue, event or SVM-range restore helper; the helper receives
+ * @priv_offset and is expected to advance it.
+ *
+ * Returns -EINVAL if the next type field would run past @max_priv_data_size
+ * or the type is unknown, otherwise the error of get_user() or the helper.
+ */
+static int criu_restore_objects(struct file *filep,
+				struct kfd_process *p,
+				struct kfd_ioctl_criu_args *args,
+				uint64_t *priv_offset,
+				uint64_t max_priv_data_size)
+{
+	int ret = 0;
+	uint32_t i;
+
+	/* object_type must be the first member so it can be read before the type is known */
+	BUILD_BUG_ON(offsetof(struct kfd_criu_queue_priv_data, object_type));
+	BUILD_BUG_ON(offsetof(struct kfd_criu_event_priv_data, object_type));
+	BUILD_BUG_ON(offsetof(struct kfd_criu_svm_range_priv_data, object_type));
+
+	for (i = 0; i < args->num_objects; i++) {
+		uint32_t object_type;
+
+		if (*priv_offset + sizeof(object_type) > max_priv_data_size) {
+			pr_err("Invalid private data size\n");
+			return -EINVAL;
+		}
+
+		/* Peek at the type only; *priv_offset is not advanced here */
+		ret = get_user(object_type, (uint32_t __user *)(args->priv_data + *priv_offset));
+		if (ret) {
+			pr_err("Failed to copy private information from user\n");
+			goto exit;
+		}
+
+		switch (object_type) {
+		case KFD_CRIU_OBJECT_TYPE_QUEUE:
+			ret = kfd_criu_restore_queue(p, (uint8_t __user *)args->priv_data,
+						     priv_offset, max_priv_data_size);
+			if (ret)
+				goto exit;
+			break;
+		case KFD_CRIU_OBJECT_TYPE_EVENT:
+			ret = kfd_criu_restore_event(filep, p, (uint8_t __user *)args->priv_data,
+						     priv_offset, max_priv_data_size);
+			if (ret)
+				goto exit;
+			break;
+		case KFD_CRIU_OBJECT_TYPE_SVM_RANGE:
+			ret = kfd_criu_restore_svm(p, (uint8_t __user *)args->priv_data,
+						     priv_offset, max_priv_data_size);
+			if (ret)
+				goto exit;
+			break;
+		default:
+			pr_err("Invalid object type:%u at index:%d\n", object_type, i);
+			ret = -EINVAL;
+			goto exit;
+		}
+	}
+exit:
+	return ret;
+}
+
+/*
+ * Restore a checkpointed process from the user buffers described by @args.
+ *
+ * Requires non-NULL bos, devices and priv_data pointers and non-zero
+ * priv_data_size, num_devices and num_bos. With p->mutex held, the queues are
+ * evicted first; then process data, devices, BOs and objects are restored in
+ * that order. The restore fails with -EINVAL unless exactly
+ * args->priv_data_size bytes of private data were consumed.
+ */
+int kfd_criu_restore(struct file *filep,
+		     struct kfd_process *p,
+		     struct kfd_ioctl_criu_args *args)
+{
+	uint64_t priv_offset = 0;
+	int ret = 0;
+
+	pr_debug("CRIU restore (num_devices:%u num_bos:%u num_objects:%u priv_data_size:%llu)\n",
+		 args->num_devices, args->num_bos, args->num_objects, args->priv_data_size);
+
+	if (!args->bos || !args->devices || !args->priv_data || !args->priv_data_size ||
+	    !args->num_devices || !args->num_bos)
+		return -EINVAL;
+
+	mutex_lock(&p->mutex);
+
+	/*
+	 * Set the process to evicted state to avoid running any new queues before all the memory
+	 * mappings are ready.
+	 */
+	ret = kfd_process_evict_queues(p, KFD_QUEUE_EVICTION_CRIU_RESTORE);
+	if (ret)
+		goto exit_unlock;
+
+	/* Each function will adjust priv_offset based on how many bytes they consumed */
+	ret = criu_restore_process(p, args, &priv_offset, args->priv_data_size);
+	if (ret)
+		goto exit_unlock;
+
+	ret = criu_restore_devices(p, args, &priv_offset, args->priv_data_size);
+	if (ret)
+		goto exit_unlock;
+
+	ret = criu_restore_bos(p, args, &priv_offset, args->priv_data_size);
+	if (ret)
+		goto exit_unlock;
+
+	ret = criu_restore_objects(filep, p, args, &priv_offset, args->priv_data_size);
+	if (ret)
+		goto exit_unlock;
+
+	/* Every byte of the private data must have been consumed */
+	if (priv_offset != args->priv_data_size) {
+		pr_err("Invalid private data size\n");
+		ret = -EINVAL;
+	}
+
+exit_unlock:
+	mutex_unlock(&p->mutex);
+	if (ret)
+		pr_err("Failed to restore CRIU ret:%d\n", ret);
+	else
+		pr_debug("CRIU restore successful\n");
+
+	return ret;
+}
+
+/*
+ * Undo the queue pause: restore the process queues and clear
+ * p->queues_paused. Returns -EINVAL if the queues are not paused, otherwise
+ * the result of kfd_process_restore_queues(); the flag is only cleared when
+ * that call succeeds.
+ */
+int kfd_criu_unpause(struct file *filep,
+		     struct kfd_process *p,
+		     struct kfd_ioctl_criu_args *args)
+{
+	int ret;
+
+	mutex_lock(&p->mutex);
+
+	if (!p->queues_paused) {
+		mutex_unlock(&p->mutex);
+		return -EINVAL;
+	}
+
+	ret = kfd_process_restore_queues(p);
+	if (ret)
+		pr_err("Failed to unpause queues ret:%d\n", ret);
+	else
+		p->queues_paused = false;
+
+	mutex_unlock(&p->mutex);
+
+	return ret;
+}
+
+/*
+ * Resume the process identified by args->pid: look up its kfd_process and,
+ * with that process's mutex held, call kfd_criu_resume_svm() followed by
+ * amdgpu_amdkfd_criu_resume(). The @p argument is not used; only the target
+ * looked up from the pid is touched.
+ *
+ * Returns -ESRCH if the pid or its kfd_process cannot be found, otherwise
+ * the result of the resume helpers.
+ */
+int kfd_criu_resume(struct file *filep,
+		    struct kfd_process *p,
+		    struct kfd_ioctl_criu_args *args)
+{
+	struct kfd_process *target = NULL;
+	struct pid *pid = NULL;
+	int ret = 0;
+
+	pr_debug("Inside %s, target pid for criu restore: %d\n", __func__,
+		 args->pid);
+
+	pid = find_get_pid(args->pid);
+	if (!pid) {
+		pr_err("Cannot find pid info for %i\n", args->pid);
+		return -ESRCH;
+	}
+
+	pr_debug("calling kfd_lookup_process_by_pid\n");
+	target = kfd_lookup_process_by_pid(pid);
+
+	/* The struct pid is only needed for the lookup */
+	put_pid(pid);
+
+	if (!target) {
+		pr_debug("Cannot find process info for %i\n", args->pid);
+		return -ESRCH;
+	}
+
+	mutex_lock(&target->mutex);
+	ret = kfd_criu_resume_svm(target);
+	if (ret) {
+		pr_err("kfd_criu_resume_svm failed for %i\n", args->pid);
+		goto exit;
+	}
+
+	ret = amdgpu_amdkfd_criu_resume(target->kgd_process_info);
+	if (ret)
+		pr_err("amdgpu_amdkfd_criu_resume failed for %i\n", args->pid);
+
+exit:
+	mutex_unlock(&target->mutex);
+
+	/* Drop the process reference taken by the lookup above */
+	kfd_unref_process(target);
+	return ret;
+}
+
+/*
+ * Prepare a checkpoint: evict the process queues, mark them paused and fill
+ * @args with the lead thread's pid, the device/BO/object counts and the
+ * required private-data size.
+ *
+ * Returns -ENODEV if the process has no PDD, otherwise the error of the
+ * failing helper. Every failure path goes through err_unlock, which calls
+ * kfd_process_restore_queues() and clears p->queues_paused.
+ */
+int kfd_criu_process_info(struct file *filep,
+			  struct kfd_process *p,
+			  struct kfd_ioctl_criu_args *args)
+{
+	int ret = 0;
+
+	mutex_lock(&p->mutex);
+
+	if (!p->n_pdds) {
+		pr_err("No pdd for given process\n");
+		ret = -ENODEV;
+		goto err_unlock;
+	}
+
+	ret = kfd_process_evict_queues(p, KFD_QUEUE_EVICTION_CRIU_CHECKPOINT);
+	if (ret)
+		goto err_unlock;
+
+	p->queues_paused = true;
+
+	/* Report the pid as seen from the lead thread's own pid namespace */
+	args->pid = task_pid_nr_ns(p->lead_thread,
+					task_active_pid_ns(p->lead_thread));
+
+	ret = criu_get_process_object_info(p, &args->num_devices, &args->num_bos,
+					   &args->num_objects, &args->priv_data_size);
+	if (ret)
+		goto err_unlock;
+
+	dev_dbg(kfd_device, "Num of devices:%u bos:%u objects:%u priv_data_size:%lld\n",
+				args->num_devices, args->num_bos, args->num_objects,
+				args->priv_data_size);
+
+err_unlock:
+	if (ret) {
+		kfd_process_restore_queues(p);
+		p->queues_paused = false;
+	}
+	mutex_unlock(&p->mutex);
+	return ret;
+}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_criu.h b/drivers/gpu/drm/amd/amdkfd/kfd_criu.h
new file mode 100644
index 000000000000..1a3d418a9505
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_criu.h
@@ -0,0 +1,50 @@
+/* SPDX-License-Identifier: GPL-2.0 OR MIT */
+/*
+ * Copyright 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef __KFD_CRIU_H__
+#define __KFD_CRIU_H__
+
+#include <uapi/linux/kfd_ioctl.h>
+#include "kfd_priv.h"
+
+/* Evict and pause the process queues, then report the pid, object counts and private-data size. */
+int kfd_criu_process_info(struct file *filep,
+				struct kfd_process *p,
+				struct kfd_ioctl_criu_args *args);
+
+/* Dump devices, BOs and objects into the user buffers in @args; the queues must already be paused. */
+int kfd_criu_checkpoint(struct file *filep,
+				struct kfd_process *p,
+				struct kfd_ioctl_criu_args *args);
+
+/* Restore the paused queues and clear the paused state. */
+int kfd_criu_unpause(struct file *filep,
+				struct kfd_process *p,
+				struct kfd_ioctl_criu_args *args);
+
+/* Evict the queues, then restore process data, devices, BOs and objects from @args. */
+int kfd_criu_restore(struct file *filep,
+				struct kfd_process *p,
+				struct kfd_ioctl_criu_args *args);
+
+/* Run the SVM and amdgpu resume steps for the process identified by args->pid. */
+int kfd_criu_resume(struct file *filep,
+				struct kfd_process *p,
+				struct kfd_ioctl_criu_args *args);
+
+#endif /* __KFD_CRIU_H__ */
-- 
2.34.1