Checkpoint-Restore in userspace (CRIU) is a powerful tool that can snapshot a running process and later restore it on same or a remote machine but expects the processes that have a device file (e.g. GPU) associated with them, provide necessary driver support to assist CRIU and its extensible plugin interface. Thus, In order to support the Checkpoint-Restore of any ROCm process, the AMD Radeon Open Compute Kernel driver, needs to provide a set of new APIs that provide necessary VRAM metadata and its contents to a userspace component (CRIU plugin) that can store it in form of image files. This introduces some new ioctls which will be used to checkpoint-Restore any KFD bound user process. KFD only allows ioctl calls from the same process that opened the KFD file descriptor. Since these ioctls are expected to be called from a KFD criu plugin which has elevated ptrace attached privileges and CAP_CHECKPOINT_RESTORE capabilities attached with the file descriptors so modify KFD to allow such calls. (API redesigned by David Yat Sin) Suggested-by: Felix Kuehling <felix.kuehling@xxxxxxx> Reviewed-by: Felix Kuehling <Felix.Kuehling@xxxxxxx> Signed-off-by: David Yat Sin <david.yatsin@xxxxxxx> Signed-off-by: Rajneesh Bhardwaj <rajneesh.bhardwaj@xxxxxxx> --- drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 98 +++++++++++++++++++++++- drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 65 +++++++++++++++- include/uapi/linux/kfd_ioctl.h | 81 +++++++++++++++++++- 3 files changed, 241 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index 214a2c67fba4..90e6d9e335a5 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -33,6 +33,7 @@ #include <linux/time.h> #include <linux/mm.h> #include <linux/mman.h> +#include <linux/ptrace.h> #include <linux/dma-buf.h> #include <asm/processor.h> #include "kfd_priv.h" @@ -1859,6 +1860,75 @@ static int kfd_ioctl_svm(struct file *filep, struct kfd_process *p, void *data) } #endif +static int criu_checkpoint(struct file *filep, + struct kfd_process *p, + struct kfd_ioctl_criu_args *args) +{ + return 0; +} + +static int criu_restore(struct file *filep, + struct kfd_process *p, + struct kfd_ioctl_criu_args *args) +{ + return 0; +} + +static int criu_unpause(struct file *filep, + struct kfd_process *p, + struct kfd_ioctl_criu_args *args) +{ + return 0; +} + +static int criu_resume(struct file *filep, + struct kfd_process *p, + struct kfd_ioctl_criu_args *args) +{ + return 0; +} + +static int criu_process_info(struct file *filep, + struct kfd_process *p, + struct kfd_ioctl_criu_args *args) +{ + return 0; +} + +static int kfd_ioctl_criu(struct file *filep, struct kfd_process *p, void *data) +{ + struct kfd_ioctl_criu_args *args = data; + int ret; + + dev_dbg(kfd_device, "CRIU operation: %d\n", args->op); + switch (args->op) { + case KFD_CRIU_OP_PROCESS_INFO: + ret = criu_process_info(filep, p, args); + break; + case KFD_CRIU_OP_CHECKPOINT: + ret = criu_checkpoint(filep, p, args); + break; + case KFD_CRIU_OP_UNPAUSE: + ret = criu_unpause(filep, p, args); + break; + case KFD_CRIU_OP_RESTORE: + ret = criu_restore(filep, p, args); + break; + case KFD_CRIU_OP_RESUME: + ret = criu_resume(filep, p, args); + break; + default: + dev_dbg(kfd_device, "Unsupported CRIU operation:%d\n", args->op); + ret = -EINVAL; + break; + } + + if (ret) + dev_dbg(kfd_device, "CRIU operation:%d err:%d\n", args->op, ret); + + return ret; +} + #define AMDKFD_IOCTL_DEF(ioctl, _func, _flags) \ [_IOC_NR(ioctl)] = {.cmd = ioctl, .func = _func, .flags = _flags, \ .cmd_drv = 0, .name = #ioctl} @@ -1962,6 +2032,10 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = { AMDKFD_IOCTL_DEF(AMDKFD_IOC_SET_XNACK_MODE, kfd_ioctl_set_xnack_mode, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_CRIU_OP, + kfd_ioctl_criu, KFD_IOC_FLAG_CHECKPOINT_RESTORE), + }; #define AMDKFD_CORE_IOCTL_COUNT ARRAY_SIZE(amdkfd_ioctls) @@ -1976,6 +2050,7 @@ static long kfd_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) char *kdata = NULL; unsigned int usize, asize; int retcode = -EINVAL; + bool ptrace_attached = false; if (nr >= AMDKFD_CORE_IOCTL_COUNT) goto err_i1; @@ -2001,7 +2076,15 @@ static long kfd_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) * processes need to create their own KFD device context. */ process = filep->private_data; - if (process->lead_thread != current->group_leader) { + + rcu_read_lock(); + if ((ioctl->flags & KFD_IOC_FLAG_CHECKPOINT_RESTORE) && + ptrace_parent(process->lead_thread) == current) + ptrace_attached = true; + rcu_read_unlock(); + + if (process->lead_thread != current->group_leader + && !ptrace_attached) { dev_dbg(kfd_device, "Using KFD FD in wrong process\n"); retcode = -EBADF; goto err_i1; @@ -2016,6 +2099,19 @@ static long kfd_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) goto err_i1; } + /* + * Versions of docker shipped in Ubuntu 18.xx and 20.xx do not support + * CAP_CHECKPOINT_RESTORE, so we also allow access if CAP_SYS_ADMIN as CAP_SYS_ADMIN is a + * more priviledged access. + */ + if (unlikely(ioctl->flags & KFD_IOC_FLAG_CHECKPOINT_RESTORE)) { + if (!capable(CAP_CHECKPOINT_RESTORE) && + !capable(CAP_SYS_ADMIN)) { + retcode = -EACCES; + goto err_i1; + } + } + if (cmd & (IOC_IN | IOC_OUT)) { if (asize <= sizeof(stack_kdata)) { kdata = stack_kdata; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index ea68f3b3a4e9..f928878196ef 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -121,7 +121,26 @@ */ #define KFD_QUEUE_DOORBELL_MIRROR_OFFSET 512 - +/** + * enum kfd_ioctl_flags - KFD ioctl flags + * Various flags that can be set in &amdkfd_ioctl_desc.flags to control how + * userspace can use a given ioctl. + */ +enum kfd_ioctl_flags { + /* + * @KFD_IOC_FLAG_CHECKPOINT_RESTORE: + * Certain KFD ioctls such as AMDKFD_IOC_CRIU_OP can potentially + * perform privileged operations and load arbitrary data into MQDs and + * eventually HQD registers when the queue is mapped by HWS. In order to + * prevent this we should perform additional security checks. + * + * This is equivalent to callers with the CHECKPOINT_RESTORE capability. + * + * Note: Since earlier versions of docker do not support CHECKPOINT_RESTORE, + * we also allow ioctls with SYS_ADMIN capability. + */ + KFD_IOC_FLAG_CHECKPOINT_RESTORE = BIT(0), +}; /* * Kernel module parameter to specify maximum number of supported queues per * device @@ -1006,6 +1025,50 @@ void kfd_process_set_trap_handler(struct qcm_process_device *qpd, uint64_t tba_addr, uint64_t tma_addr); +/* CRIU */ +/* + * Need to increment KFD_CRIU_PRIV_VERSION each time a change is made to any of the CRIU private + * structures: + * kfd_criu_process_priv_data + * kfd_criu_device_priv_data + * kfd_criu_bo_priv_data + * kfd_criu_queue_priv_data + * kfd_criu_event_priv_data + * kfd_criu_svm_range_priv_data + */ + +#define KFD_CRIU_PRIV_VERSION 1 + +struct kfd_criu_process_priv_data { + uint32_t version; +}; + +struct kfd_criu_device_priv_data { + /* For future use */ + uint64_t reserved; +}; + +struct kfd_criu_bo_priv_data { + uint64_t reserved; +}; + +struct kfd_criu_svm_range_priv_data { + uint32_t object_type; + uint32_t reserved; +}; + +struct kfd_criu_queue_priv_data { + uint32_t object_type; + uint32_t reserved; +}; + +struct kfd_criu_event_priv_data { + uint32_t object_type; + uint32_t reserved; +}; + +/* CRIU - End */ + /* Queue Context Management */ int init_queue(struct queue **q, const struct queue_properties *properties); void uninit_queue(struct queue *q); diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index af96af174dc4..49429a6c42fc 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -468,6 +468,82 @@ struct kfd_ioctl_smi_events_args { __u32 anon_fd; /* from KFD */ }; +/************************************************************************************************** + * CRIU IOCTLs (Checkpoint Restore In Userspace) + * + * When checkpointing a process, the userspace application will perform: + * 1. PROCESS_INFO op to determine current process information. This pauses execution and evicts + * all the queues. + * 2. CHECKPOINT op to checkpoint process contents (BOs, queues, events, svm-ranges) + * 3. UNPAUSE op to un-evict all the queues + * + * When restoring a process, the CRIU userspace application will perform: + * + * 1. RESTORE op to restore process contents + * 2. RESUME op to start the process + * + * Note: Queues are forced into an evicted state after a successful PROCESS_INFO. User + * application needs to perform an UNPAUSE operation after calling PROCESS_INFO. + */ + +enum kfd_criu_op { + KFD_CRIU_OP_PROCESS_INFO, + KFD_CRIU_OP_CHECKPOINT, + KFD_CRIU_OP_UNPAUSE, + KFD_CRIU_OP_RESTORE, + KFD_CRIU_OP_RESUME, +}; + +/** + * kfd_ioctl_criu_args - Arguments perform CRIU operation + * @devices: [in/out] User pointer to memory location for devices information. + * This is an array of type kfd_criu_device_bucket. + * @bos: [in/out] User pointer to memory location for BOs information + * This is an array of type kfd_criu_bo_bucket. + * @priv_data: [in/out] User pointer to memory location for private data + * @priv_data_size: [in/out] Size of priv_data in bytes + * @num_devices: [in/out] Number of GPUs used by process. Size of @devices array. + * @num_bos [in/out] Number of BOs used by process. Size of @bos array. + * @num_objects: [in/out] Number of objects used by process. Objects are opaque to + * user application. + * @pid: [in/out] PID of the process being checkpointed + * @op [in] Type of operation (kfd_criu_op) + * + * Return: 0 on success, -errno on failure + */ +struct kfd_ioctl_criu_args { + __u64 devices; /* Used during ops: CHECKPOINT, RESTORE */ + __u64 bos; /* Used during ops: CHECKPOINT, RESTORE */ + __u64 priv_data; /* Used during ops: CHECKPOINT, RESTORE */ + __u64 priv_data_size; /* Used during ops: PROCESS_INFO, RESTORE */ + __u32 num_devices; /* Used during ops: PROCESS_INFO, RESTORE */ + __u32 num_bos; /* Used during ops: PROCESS_INFO, RESTORE */ + __u32 num_objects; /* Used during ops: PROCESS_INFO, RESTORE */ + __u32 pid; /* Used during ops: PROCESS_INFO, RESUME */ + __u32 op; +}; + +struct kfd_criu_device_bucket { + __u32 user_gpu_id; + __u32 actual_gpu_id; + __u32 drm_fd; + __u32 pad; +}; + +struct kfd_criu_bo_bucket { + __u64 addr; + __u64 size; + __u64 offset; + __u64 restored_offset; /* During restore, updated offset for BO */ + __u32 gpu_id; /* This is the user_gpu_id */ + __u32 alloc_flags; + __u32 dmabuf_fd; + __u32 pad; +}; + +/* CRIU IOCTLs - END */ +/**************************************************************************************************/ + /* Register offset inside the remapped mmio page */ enum kfd_mmio_remap { @@ -742,7 +818,10 @@ struct kfd_ioctl_set_xnack_mode_args { #define AMDKFD_IOC_SET_XNACK_MODE \ AMDKFD_IOWR(0x21, struct kfd_ioctl_set_xnack_mode_args) +#define AMDKFD_IOC_CRIU_OP \ + AMDKFD_IOWR(0x22, struct kfd_ioctl_criu_args) + #define AMDKFD_COMMAND_START 0x01 -#define AMDKFD_COMMAND_END 0x22 +#define AMDKFD_COMMAND_END 0x23 #endif -- 2.17.1