Am 2021-08-19 um 9:36 a.m. schrieb David Yat Sin: > From: Rajneesh Bhardwaj <rajneesh.bhardwaj@xxxxxxx> > > Checkpoint-Restore in userspace (CRIU) is a powerful tool that can > snapshot a running process and later restore it on same or a remote > machine but expects the processes that have a device file (e.g. GPU) > associated with them, provide necessary driver support to assist CRIU > and its extensible plugin interface. Thus, In order to support the > Checkpoint-Restore of any ROCm process, the AMD Radeon Open Compute > Kernel driver, needs to provide a set of new APIs that provide > necessary VRAM metadata and its contents to a userspace component > (CRIU plugin) that can store it in form of image files. > > This introduces some new ioctls which will be used to checkpoint-Restore > any KFD bound user process. KFD doesn't allow any arbitrary ioctl call > unless it is called by the group leader process. Since these ioctls are > expected to be called from a KFD criu plugin which has elevated ptrace > attached privileges and CAP_SYS_ADMIN capabilities attached with the file > descriptors so modify KFD to allow such calls. > > (API redesign suggested by Felix Kuehling and implemented by David Yat > Sin) > > Signed-off-by: David Yat Sin <david.yatsin@xxxxxxx> > Signed-off-by: Rajneesh Bhardwaj <rajneesh.bhardwaj@xxxxxxx> > (cherry picked from commit 72f4907135aed9c037b9f442a6055b51733b518a) > (cherry picked from commit 33ff4953c5352f51d57a77ba8ae6614b7993e70d) > --- > drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 70 ++++++++++++++- > drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 69 ++++++++++++++ > include/uapi/linux/kfd_ioctl.h | 110 ++++++++++++++++++++++- > 3 files changed, 247 insertions(+), 2 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c > index 059c3f1ca27d..a1b60d29aae1 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c > @@ -33,6 +33,7 @@ > #include <linux/time.h> > #include <linux/mm.h> > #include <linux/mman.h> > +#include <linux/ptrace.h> > #include <linux/dma-buf.h> > #include <asm/processor.h> > #include "kfd_priv.h" > @@ -1802,6 +1803,44 @@ static int kfd_ioctl_svm(struct file *filep, struct kfd_process *p, void *data) > return -EPERM; > } > #endif > +static int kfd_ioctl_criu_dumper(struct file *filep, > + struct kfd_process *p, void *data) > +{ > + pr_debug("Inside %s\n", __func__); > + > + return 0; > +} > + > +static int kfd_ioctl_criu_restorer(struct file *filep, > + struct kfd_process *p, void *data) > +{ > + pr_debug("Inside %s\n", __func__); > + > + return 0; > +} > + > +static int kfd_ioctl_criu_pause(struct file *filep, struct kfd_process *p, void *data) > +{ > + pr_debug("Inside %s\n", __func__); > + > + return 0; > +} > + > +static int kfd_ioctl_criu_resume(struct file *filep, > + struct kfd_process *p, void *data) > +{ > + pr_debug("Inside %s\n", __func__); > + > + return 0; > +} > + > +static int kfd_ioctl_criu_process_info(struct file *filep, > + struct kfd_process *p, void *data) > +{ > + pr_debug("Inside %s\n", __func__); > + > + return 0; > +} > > #define AMDKFD_IOCTL_DEF(ioctl, _func, _flags) \ > [_IOC_NR(ioctl)] = {.cmd = ioctl, .func = _func, .flags = _flags, \ > @@ -1906,6 +1945,21 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = { > > AMDKFD_IOCTL_DEF(AMDKFD_IOC_SET_XNACK_MODE, > kfd_ioctl_set_xnack_mode, 0), > + > + AMDKFD_IOCTL_DEF(AMDKFD_IOC_CRIU_DUMPER, > + kfd_ioctl_criu_dumper, KFD_IOC_FLAG_PTRACE_ATTACHED), > + > + AMDKFD_IOCTL_DEF(AMDKFD_IOC_CRIU_RESTORER, > + kfd_ioctl_criu_restorer, KFD_IOC_FLAG_ROOT_ONLY), > + > + AMDKFD_IOCTL_DEF(AMDKFD_IOC_CRIU_PROCESS_INFO, > + kfd_ioctl_criu_process_info, KFD_IOC_FLAG_PTRACE_ATTACHED), > + > + AMDKFD_IOCTL_DEF(AMDKFD_IOC_CRIU_RESUME, > + kfd_ioctl_criu_resume, KFD_IOC_FLAG_ROOT_ONLY), > + > + AMDKFD_IOCTL_DEF(AMDKFD_IOC_CRIU_PAUSE, > + kfd_ioctl_criu_pause, KFD_IOC_FLAG_PTRACE_ATTACHED), > }; > > #define AMDKFD_CORE_IOCTL_COUNT ARRAY_SIZE(amdkfd_ioctls) > @@ -1920,6 +1974,7 @@ static long kfd_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) > char *kdata = NULL; > unsigned int usize, asize; > int retcode = -EINVAL; > + bool ptrace_attached = false; > > if (nr >= AMDKFD_CORE_IOCTL_COUNT) > goto err_i1; > @@ -1945,7 +2000,15 @@ static long kfd_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) > * processes need to create their own KFD device context. > */ > process = filep->private_data; > - if (process->lead_thread != current->group_leader) { > + > + rcu_read_lock(); > + if ((ioctl->flags & KFD_IOC_FLAG_PTRACE_ATTACHED) && > + ptrace_parent(process->lead_thread) == current) > + ptrace_attached = true; > + rcu_read_unlock(); > + > + if (process->lead_thread != current->group_leader > + && !ptrace_attached) { > dev_dbg(kfd_device, "Using KFD FD in wrong process\n"); > retcode = -EBADF; > goto err_i1; > @@ -1960,6 +2023,11 @@ static long kfd_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) > goto err_i1; > } > > + /* KFD_IOC_FLAG_ROOT_ONLY is only for CAP_SYS_ADMIN */ > + if (unlikely((ioctl->flags & KFD_IOC_FLAG_ROOT_ONLY) && > + !capable(CAP_SYS_ADMIN))) > + return -EACCES; > + > if (cmd & (IOC_IN | IOC_OUT)) { > if (asize <= sizeof(stack_kdata)) { > kdata = stack_kdata; > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > index 64552f6b8ba4..768cc3fe95d2 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > @@ -121,7 +121,35 @@ > */ > #define KFD_QUEUE_DOORBELL_MIRROR_OFFSET 512 > > +/** > + * enum kfd_ioctl_flags - KFD ioctl flags > + * Various flags that can be set in &amdkfd_ioctl_desc.flags to control how > + * userspace can use a given ioctl. > + */ > +enum kfd_ioctl_flags { > + /** > + * @KFD_IOC_FLAG_ROOT_ONLY: > + * Certain KFD ioctls such as AMDKFD_IOC_CRIU_RESTORER can potentially > + * perform privileged operations and load arbitrary data into MQDs and > + * eventually HQD registers when the queue is mapped by HWS. In order to > + * prevent this we should perform additional security checks. In other > + * cases, certain ioctls such as AMDKFD_IOC_CRIU_RESUME might be called > + * by an external process e.g. CRIU restore process, for each resuming > + * tasks and thus require elevated privileges. > + * > + * This is equivalent to callers with the SYSADMIN capability. > + */ > + KFD_IOC_FLAG_ROOT_ONLY = BIT(0), > + /** > + * @KFD_IOC_FLAG_PTRACE_ATTACHED: > + * Certain KFD ioctls such as AMDKFD_IOC_CRIU_HELPER and > + * AMDKFD_IOC_CRIU_DUMPER are expected to be called during a Checkpoint > + * operation triggered by CRIU. Since, these are expected to be called > + * from a PTRACE attached context, we must authenticate these. > + */ > + KFD_IOC_FLAG_PTRACE_ATTACHED = BIT(1), > > +}; > /* > * Kernel module parameter to specify maximum number of supported queues per > * device > @@ -977,6 +1005,47 @@ void kfd_process_set_trap_handler(struct qcm_process_device *qpd, > uint64_t tba_addr, > uint64_t tma_addr); > > +/* CRIU */ > +/* > + * Need to increment KFD_CRIU_PRIV_VERSION each time a change is made to any of the CRIU private > + * structures: > + * kfd_criu_process_priv_data > + * kfd_criu_device_priv_data > + * kfd_criu_bo_priv_data > + * kfd_criu_queue_priv_data > + * kfd_criu_event_priv_data > + * kfd_criu_svm_range_priv_data > + */ > + > +#define KFD_CRIU_PRIV_VERSION 1 > + > +struct kfd_criu_process_priv_data { > + uint32_t version; > +}; > + > +struct kfd_criu_device_priv_data { > + /* For future use */ > + uint64_t reserved; > +}; > + > +struct kfd_criu_bo_priv_data { > + uint64_t reserved; > +}; > + > +struct kfd_criu_svm_range_priv_data { > + uint64_t reserved; > +}; > + > +struct kfd_criu_queue_priv_data { > + uint64_t reserved; > +}; > + > +struct kfd_criu_event_priv_data { > + uint64_t reserved; > +}; > + > +/* CRIU - End */ > + > /* Queue Context Management */ > int init_queue(struct queue **q, const struct queue_properties *properties); > void uninit_queue(struct queue *q); > diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h > index 3cb5b5dd9f77..19489e2ca58e 100644 > --- a/include/uapi/linux/kfd_ioctl.h > +++ b/include/uapi/linux/kfd_ioctl.h > @@ -467,6 +467,99 @@ struct kfd_ioctl_smi_events_args { > __u32 anon_fd; /* from KFD */ > }; > > +struct kfd_criu_process_bucket { > + __u64 priv_data_offset; > + __u64 priv_data_size; > +}; > + > +struct kfd_criu_device_bucket { > + __u64 priv_data_offset; > + __u64 priv_data_size; > + __u32 user_gpu_id; > + __u32 actual_gpu_id; > + __u32 drm_fd; > + __u32 pad; > +}; > + > +struct kfd_criu_bo_bucket { > + __u64 priv_data_offset; > + __u64 priv_data_size; > + __u64 addr; > + __u64 size; > + __u64 offset; > + __u64 restored_offset; > + __u32 gpu_id; > + __u32 alloc_flags; > + __u32 dmabuf_fd; > + __u32 pad; > +}; > + > +struct kfd_criu_queue_bucket { > + __u64 priv_data_offset; > + __u64 priv_data_size; > + __u32 gpu_id; > + __u32 pad; > +}; > + > +struct kfd_criu_event_bucket { > + __u64 priv_data_offset; > + __u64 priv_data_size; > + __u32 gpu_id; > + __u32 pad; > +}; > + > +struct kfd_ioctl_criu_process_info_args { > + __u64 process_priv_data_size; > + __u64 bos_priv_data_size; > + __u64 devices_priv_data_size; > + __u64 queues_priv_data_size; > + __u64 events_priv_data_size; > + __u64 svm_ranges_priv_data_size; > + __u64 total_bos; > + __u64 total_svm_ranges; > + __u32 total_devices; > + __u32 total_queues; > + __u32 total_events; > + __u32 task_pid; > +}; > + > +struct kfd_ioctl_criu_pause_args { > + __u32 pause; > + __u32 pad; > +}; > + > +enum kfd_criu_object_type { > + KFD_CRIU_OBJECT_TYPE_PROCESS = 0, > + KFD_CRIU_OBJECT_TYPE_DEVICE = 1, > + KFD_CRIU_OBJECT_TYPE_BO = 2, > + KFD_CRIU_OBJECT_TYPE_QUEUE = 3, > + KFD_CRIU_OBJECT_TYPE_EVENT = 4, > + KFD_CRIU_OBJECT_TYPE_SVM_RANGE = 5, > +}; > + Please add comments explaining the members of the ioctl args structures. E.g. it's not obvious that objects is a user mode pointer, or the semantics of the objects_index_start field. Regards, Felix > +struct kfd_ioctl_criu_dumper_args { > + __u64 num_objects; > + __u64 objects; > + __u64 objects_size; > + __u64 objects_index_start; > + __u32 type; /* enum kfd_criu_object_type */ > + __u32 pad; > +}; > + > +struct kfd_ioctl_criu_restorer_args { > + __u64 num_objects; > + __u64 objects; > + __u64 objects_size; > + __u64 objects_index_start; > + __u32 type; /* enum kfd_criu_object_type */ > + __u32 pad; > +}; > + > +struct kfd_ioctl_criu_resume_args { > + __u32 pid; /* to KFD */ > + __u32 pad; > +}; > + > /* Register offset inside the remapped mmio page > */ > enum kfd_mmio_remap { > @@ -740,7 +833,22 @@ struct kfd_ioctl_set_xnack_mode_args { > #define AMDKFD_IOC_SET_XNACK_MODE \ > AMDKFD_IOWR(0x21, struct kfd_ioctl_set_xnack_mode_args) > > +#define AMDKFD_IOC_CRIU_DUMPER \ > + AMDKFD_IOWR(0x22, struct kfd_ioctl_criu_dumper_args) > + > +#define AMDKFD_IOC_CRIU_RESTORER \ > + AMDKFD_IOWR(0x23, struct kfd_ioctl_criu_restorer_args) > + > +#define AMDKFD_IOC_CRIU_PROCESS_INFO \ > + AMDKFD_IOWR(0x24, struct kfd_ioctl_criu_process_info_args) > + > +#define AMDKFD_IOC_CRIU_RESUME \ > + AMDKFD_IOWR(0x25, struct kfd_ioctl_criu_resume_args) > + > +#define AMDKFD_IOC_CRIU_PAUSE \ > + AMDKFD_IOWR(0x26, struct kfd_ioctl_criu_pause_args) > + > #define AMDKFD_COMMAND_START 0x01 > -#define AMDKFD_COMMAND_END 0x22 > +#define AMDKFD_COMMAND_END 0x27 > > #endif