Hi Chris,

A few questions/comments throughout. I may be off the mark on some. Please
bear with me as I try to get more familiar with the gem code.

Thanks,
Brad

[ snip ]

On Fri, Jan 24, 2014 at 01:00:19AM -0800, Chris Wilson wrote:
> +static void
> +__i915_mmu_notifier_destroy_worker(struct work_struct *work)
> +{
> +        struct i915_mmu_notifier *mmu = container_of(work, typeof(*mmu), work);
> +        mmu_notifier_unregister(&mmu->mn, mmu->mm);
> +        kfree(mmu);
> +}
> +
> +static void
> +__i915_mmu_notifier_destroy(struct i915_mmu_notifier *mmu)
> +{
> +        hash_del(&mmu->node);
> +        INIT_WORK(&mmu->work, __i915_mmu_notifier_destroy_worker);
> +        schedule_work(&mmu->work);

The commit message mentions a potential lock inversion as the reason for
using a wq. A comment with the details might be helpful.

> +}
> +
> +static void __i915_mmu_notifier_update_serial(struct i915_mmu_notifier *mmu)
> +{
> +        if (++mmu->serial == 0)
> +                mmu->serial = 1;
> +}
> +
> +static void
> +i915_mmu_notifier_del(struct i915_mmu_notifier *mmu,
> +                      struct i915_mmu_object *mn)
> +{
> +        bool destroy;
> +
> +        spin_lock(&mmu->lock);
> +        interval_tree_remove(&mn->it, &mmu->objects);
> +        destroy = --mmu->count == 0;
> +        __i915_mmu_notifier_update_serial(mmu);
> +        spin_unlock(&mmu->lock);
> +
> +        if (destroy) /* protected against _add() by struct_mutex */
> +                __i915_mmu_notifier_destroy(mmu);

I see that we should hold struct_mutex when this function is called, but I
don't see that we try to get the mutex anywhere on the _add() path. Have I
missed something?

> +}
> +
> +static int
> +i915_mmu_notifier_add(struct i915_mmu_notifier *mmu,
> +                      struct i915_mmu_object *mn)
> +{
> +        int ret = -EINVAL;
> +
> +        spin_lock(&mmu->lock);
> +        /* Disallow overlapping userptr objects */
> +        if (!interval_tree_iter_first(&mmu->objects,
> +                                      mn->it.start, mn->it.last)) {
> +                interval_tree_insert(&mn->it, &mmu->objects);
> +                mmu->count++;
> +                __i915_mmu_notifier_update_serial(mmu);
> +                ret = 0;
> +        }
> +        spin_unlock(&mmu->lock);
> +
> +        return ret;
> +}
> +
> +static void
> +i915_gem_userptr_release__mmu_notifier(struct drm_i915_gem_object *obj)
> +{
> +        struct i915_mmu_object *mn;
> +
> +        mn = obj->userptr.mn;
> +        if (mn == NULL)
> +                return;
> +
> +        i915_mmu_notifier_del(mn->mmu, mn);
> +        obj->userptr.mn = NULL;
> +}
> +
> +static int
> +i915_gem_userptr_init__mmu_notifier(struct drm_i915_gem_object *obj,
> +                                    unsigned flags)
> +{
> +        struct i915_mmu_notifier *mmu;
> +        struct i915_mmu_object *mn;
> +        int ret;
> +
> +        if (flags & I915_USERPTR_UNSYNCHRONIZED)
> +                return capable(CAP_SYS_ADMIN) ? 0 : -EPERM;
> +
> +        mmu = i915_mmu_notifier_get(obj->base.dev, obj->userptr.mm);
> +        if (IS_ERR(mmu))
> +                return PTR_ERR(mmu);
> +
> +        mn = kzalloc(sizeof(*mn), GFP_KERNEL);
> +        if (mn == NULL) {
> +                ret = -ENOMEM;
> +                goto destroy_mmu;
> +        }
> +
> +        mn->mmu = mmu;
> +        mn->it.start = obj->userptr.ptr;
> +        mn->it.last = mn->it.start + obj->base.size - 1;
> +        mn->obj = obj;
> +
> +        ret = i915_mmu_notifier_add(mmu, mn);
> +        if (ret)
> +                goto free_mn;
> +
> +        obj->userptr.mn = mn;
> +        return 0;
> +
> +free_mn:
> +        kfree(mn);
> +destroy_mmu:
> +        if (mmu->count == 0)
> +                __i915_mmu_notifier_destroy(mmu);

Other accesses to mmu->count are protected by mmu->lock. Again, I may have
missed something but don't immediately see why that's not required.
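If the _add() path really can run without struct_mutex, then I would also
have expected this check to be done under mmu->lock, mirroring _del().
Something like this completely untested sketch (which assumes a bool
destroy added to the locals) is what I had in mind:

free_mn:
        kfree(mn);
destroy_mmu:
        spin_lock(&mmu->lock);
        destroy = mmu->count == 0;      /* assumes a local: bool destroy; */
        spin_unlock(&mmu->lock);
        if (destroy)
                __i915_mmu_notifier_destroy(mmu);
        return ret;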
> +        return ret;
> +}
> +
> +#else
> +
> +static void
> +i915_gem_userptr_release__mmu_notifier(struct drm_i915_gem_object *obj)
> +{
> +}
> +
> +static int
> +i915_gem_userptr_init__mmu_notifier(struct drm_i915_gem_object *obj,
> +                                    unsigned flags)
> +{
> +        if ((flags & I915_USERPTR_UNSYNCHRONIZED) == 0)
> +                return -ENODEV;
> +
> +        if (!capable(CAP_SYS_ADMIN))
> +                return -EPERM;
> +
> +        return 0;
> +}
> +#endif
> +
> +struct get_pages_work {
> +        struct work_struct work;
> +        struct drm_i915_gem_object *obj;
> +        struct task_struct *task;
> +};
> +
> +static void
> +__i915_gem_userptr_get_pages_worker(struct work_struct *_work)
> +{
> +        struct get_pages_work *work = container_of(_work, typeof(*work), work);
> +        struct drm_i915_gem_object *obj = work->obj;
> +        struct drm_device *dev = obj->base.dev;
> +        const int num_pages = obj->base.size >> PAGE_SHIFT;
> +        struct page **pvec;
> +        int pinned, ret;
> +
> +        ret = -ENOMEM;
> +        pinned = 0;
> +
> +        pvec = kmalloc(num_pages*sizeof(struct page *),
> +                       GFP_TEMPORARY | __GFP_NOWARN | __GFP_NORETRY);
> +        if (pvec == NULL)
> +                pvec = drm_malloc_ab(num_pages, sizeof(struct page *));
> +        if (pvec != NULL) {
> +                struct mm_struct *mm = obj->userptr.mm;
> +
> +                use_mm(mm);
> +                down_read(&mm->mmap_sem);
> +                while (pinned < num_pages) {
> +                        ret = get_user_pages(work->task, mm,
> +                                             obj->userptr.ptr + pinned * PAGE_SIZE,
> +                                             num_pages - pinned,
> +                                             !obj->userptr.read_only, 0,
> +                                             pvec + pinned, NULL);
> +                        if (ret < 0)
> +                                break;
> +
> +                        pinned += ret;
> +                }
> +                up_read(&mm->mmap_sem);
> +                unuse_mm(mm);
> +        }
> +
> +        mutex_lock(&dev->struct_mutex);
> +        if (obj->userptr.work != &work->work) {
> +                ret = 0;
> +        } else if (pinned == num_pages) {
> +                struct sg_table *st;
> +
> +                st = kmalloc(sizeof(*st), GFP_KERNEL);
> +                if (st == NULL || sg_alloc_table(st, num_pages, GFP_KERNEL)) {
> +                        kfree(st);
> +                        ret = -ENOMEM;
> +                } else {
> +                        struct scatterlist *sg;
> +                        int n;
> +
> +                        for_each_sg(st->sgl, sg, num_pages, n)
> +                                sg_set_page(sg, pvec[n], PAGE_SIZE, 0);
> +
> +                        obj->pages = st;
> +                        list_add_tail(&obj->global_list, &to_i915(dev)->mm.unbound_list);
> +                        pinned = 0;
> +                        ret = 0;
> +                }
> +        }
> +
> +        obj->userptr.work = ERR_PTR(ret);
> +        drm_gem_object_unreference(&obj->base);
> +        mutex_unlock(&dev->struct_mutex);
> +
> +        release_pages(pvec, pinned, 0);
> +        drm_free_large(pvec);
> +
> +        put_task_struct(work->task);
> +        kfree(work);
> +}
> +
> +static int
> +i915_gem_userptr_get_pages(struct drm_i915_gem_object *obj)
> +{
> +        const int num_pages = obj->base.size >> PAGE_SHIFT;
> +        struct page **pvec;
> +        int pinned, ret;
> +
> +        /* If userspace should engineer that these pages are replaced in
> +         * the vma between us binding this page into the GTT and completion
> +         * of rendering... Their loss. If they change the mapping of their
> +         * pages they need to create a new bo to point to the new vma.
> +         *
> +         * However, that still leaves open the possibility of the vma
> +         * being copied upon fork. Which falls under the same userspace
> +         * synchronisation issue as a regular bo, except that this time
> +         * the process may not be expecting that a particular piece of
> +         * memory is tied to the GPU.
> +         *
> +         * Fortunately, we can hook into the mmu_notifier in order to
> +         * discard the page references prior to anything nasty happening
> +         * to the vma (discard or cloning) which should prevent the more
> +         * egregious cases from causing harm.
> +         */
> +
> +        pvec = NULL;
> +        pinned = 0;
> +        if (obj->userptr.mm == current->mm) {
> +                pvec = kmalloc(num_pages*sizeof(struct page *),
> +                               GFP_TEMPORARY | __GFP_NOWARN | __GFP_NORETRY);
> +                if (pvec == NULL) {
> +                        pvec = drm_malloc_ab(num_pages, sizeof(struct page *));
> +                        if (pvec == NULL)
> +                                return -ENOMEM;
> +                }
> +
> +                pinned = __get_user_pages_fast(obj->userptr.ptr, num_pages,
> +                                               !obj->userptr.read_only, pvec);
> +        }
> +        if (pinned < num_pages) {
> +                if (pinned < 0) {
> +                        ret = pinned;
> +                        pinned = 0;
> +                } else {
> +                        /* Spawn a worker so that we can acquire the
> +                         * user pages without holding our mutex.
> +                         */
> +                        ret = -EAGAIN;
> +                        if (obj->userptr.work == NULL) {
> +                                struct get_pages_work *work;
> +
> +                                work = kmalloc(sizeof(*work), GFP_KERNEL);
> +                                if (work != NULL) {
> +                                        obj->userptr.work = &work->work;
> +
> +                                        work->obj = obj;
> +                                        drm_gem_object_reference(&obj->base);
> +
> +                                        work->task = current;
> +                                        get_task_struct(work->task);
> +
> +                                        INIT_WORK(&work->work, __i915_gem_userptr_get_pages_worker);
> +                                        schedule_work(&work->work);

Any reason to use the system wq instead of the driver wq (dev_priv->wq)
here? It doesn't look like the usual "takes modeset locks" justification
applies.

> +                                } else
> +                                        ret = -ENOMEM;
> +                        } else {
> +                                if (IS_ERR(obj->userptr.work)) {

Could this be collapsed to "} else if (IS_ERR(obj->userptr.work)) {" to
save a level of nesting?

> +                                        ret = PTR_ERR(obj->userptr.work);
> +                                        obj->userptr.work = NULL;
> +                                }
> +                        }
> +                }
> +        } else {
> +                struct sg_table *st;
> +
> +                st = kmalloc(sizeof(*st), GFP_KERNEL);
> +                if (st == NULL || sg_alloc_table(st, num_pages, GFP_KERNEL)) {
> +                        kfree(st);
> +                        ret = -ENOMEM;
> +                } else {
> +                        struct scatterlist *sg;
> +                        int n;
> +
> +                        for_each_sg(st->sgl, sg, num_pages, n)
> +                                sg_set_page(sg, pvec[n], PAGE_SIZE, 0);
> +
> +                        obj->pages = st;
> +                        obj->userptr.work = NULL;
> +
> +                        pinned = 0;
> +                        ret = 0;
> +                }

This block is almost identical to the sg_table setup in the worker. Would
it be worth extracting the common parts into a helper? (Rough sketch a bit
further down, after the quoted code.)

> +        }
> +
> +        release_pages(pvec, pinned, 0);
> +        drm_free_large(pvec);
> +        return ret;
> +}
> +
> +static void
> +i915_gem_userptr_put_pages(struct drm_i915_gem_object *obj)
> +{
> +        struct scatterlist *sg;
> +        int i;
> +
> +        if (obj->madv != I915_MADV_WILLNEED)
> +                obj->dirty = 0;

This is subtly different from the similar code in the standard put_pages(),
in that it clears dirty for both DONTNEED and PURGED instead of only
DONTNEED (with a BUG_ON(PURGED)). I don't think we will ever actually
truncate userptr objects, so is there any reason for this to be different?

> +
> +        for_each_sg(obj->pages->sgl, sg, obj->pages->nents, i) {
> +                struct page *page = sg_page(sg);
> +
> +                if (obj->dirty)
> +                        set_page_dirty(page);
> +
> +                mark_page_accessed(page);
> +                page_cache_release(page);
> +        }
> +        obj->dirty = 0;
> +
> +        sg_free_table(obj->pages);
> +        kfree(obj->pages);
> +
> +        BUG_ON(obj->userptr.work != NULL);
> +}
> +
> +static void
> +i915_gem_userptr_release(struct drm_i915_gem_object *obj)
> +{
> +        i915_gem_userptr_release__mmu_notifier(obj);
> +
> +        if (obj->userptr.mm) {
> +                mmput(obj->userptr.mm);
> +                obj->userptr.mm = NULL;
> +        }
> +}
> +
> +static const struct drm_i915_gem_object_ops i915_gem_userptr_ops = {
> +        .get_pages = i915_gem_userptr_get_pages,
> +        .put_pages = i915_gem_userptr_put_pages,
> +        .release = i915_gem_userptr_release,
> +};
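Regarding the duplicated sg_table setup mentioned above: a small shared
helper along these lines might do (name and exact shape are only a
suggestion, untested):

static struct sg_table *st_set_pages(struct page **pvec, int num_pages)
{
        struct sg_table *st;
        struct scatterlist *sg;
        int n;

        st = kmalloc(sizeof(*st), GFP_KERNEL);
        if (st == NULL)
                return ERR_PTR(-ENOMEM);

        if (sg_alloc_table(st, num_pages, GFP_KERNEL)) {
                kfree(st);
                return ERR_PTR(-ENOMEM);
        }

        /* build a 1:1 sg list from the pinned page array */
        for_each_sg(st->sgl, sg, num_pages, n)
                sg_set_page(sg, pvec[n], PAGE_SIZE, 0);

        return st;
}

Both callers would then just assign obj->pages on success and keep their
own handling of the worker-only parts (global_list, pinned, etc.).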
> +
> +/**
> + * Creates a new mm object that wraps some normal memory from the process
> + * context - user memory.
> + *
> + * We impose several restrictions upon the memory being mapped
> + * into the GPU.
> + * 1. It must be page aligned (both start/end addresses, i.e ptr and size).
> + * 2. We only allow a bo as large as we could in theory map into the GTT,
> + * that is we limit the size to the total size of the GTT.
> + * 3. The bo is marked as being snoopable. The backing pages are left
> + * accessible directly by the CPU, but reads by the GPU may incur the cost
> + * of a snoop (unless you have an LLC architecture).

Should the "no overlapping ranges" restriction (enforced via the interval
tree in i915_mmu_notifier_add()) be listed here as well? Maybe something
like the wording sketched below. - Brad
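Roughly (wording only a suggestion, numbering assuming it becomes the
fourth item in the list):

 * 4. No two userptr objects may overlap: when MMU notifiers are in use,
 *    i915_mmu_notifier_add() rejects an overlapping range and the ioctl
 *    then fails with -EINVAL.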
> + *
> + * Synchronisation between multiple users and the GPU is left to userspace
> + * through the normal set-domain-ioctl. The kernel will enforce that the
> + * GPU relinquishes the VMA before it is returned back to the system
> + * i.e. upon free(), munmap() or process termination. However, the userspace
> + * malloc() library may not immediately relinquish the VMA after free() and
> + * instead reuse it whilst the GPU is still reading and writing to the VMA.
> + * Caveat emptor.
> + *
> + * Also note, that the object created here is not currently a "first class"
> + * object, in that several ioctls are banned. These are the CPU access
> + * ioctls: mmap(), pwrite and pread. In practice, you are expected to use
> + * direct access via your pointer rather than use those ioctls.
> + *
> + * If you think this is a good interface to use to pass GPU memory between
> + * drivers, please use dma-buf instead. In fact, wherever possible use
> + * dma-buf instead.
> + */
> +int
> +i915_gem_userptr_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
> +{
> +        struct drm_i915_private *dev_priv = dev->dev_private;
> +        struct drm_i915_gem_userptr *args = data;
> +        struct drm_i915_gem_object *obj;
> +        int ret;
> +        u32 handle;
> +
> +        if (args->flags & ~(I915_USERPTR_READ_ONLY |
> +                            I915_USERPTR_UNSYNCHRONIZED))
> +                return -EINVAL;
> +
> +        if (offset_in_page(args->user_ptr | args->user_size))
> +                return -EINVAL;
> +
> +        if (args->user_size > dev_priv->gtt.base.total)
> +                return -E2BIG;
> +
> +        if (!access_ok(args->flags & I915_USERPTR_READ_ONLY ? VERIFY_READ : VERIFY_WRITE,
> +                       (char __user *)(unsigned long)args->user_ptr, args->user_size))
> +                return -EFAULT;
> +
> +        /* Allocate the new object */
> +        obj = i915_gem_object_alloc(dev);
> +        if (obj == NULL)
> +                return -ENOMEM;
> +
> +        drm_gem_private_object_init(dev, &obj->base, args->user_size);
> +        i915_gem_object_init(obj, &i915_gem_userptr_ops);
> +        obj->cache_level = I915_CACHE_LLC;
> +        obj->base.write_domain = I915_GEM_DOMAIN_CPU;
> +        obj->base.read_domains = I915_GEM_DOMAIN_CPU;
> +
> +        obj->userptr.ptr = args->user_ptr;
> +        obj->userptr.read_only = !!(args->flags & I915_USERPTR_READ_ONLY);
> +
> +        /* And keep a pointer to the current->mm for resolving the user pages
> +         * at binding. This means that we need to hook into the mmu_notifier
> +         * in order to detect if the mmu is destroyed.
> +         */
> +        ret = -ENOMEM;
> +        if ((obj->userptr.mm = get_task_mm(current)))
> +                ret = i915_gem_userptr_init__mmu_notifier(obj, args->flags);
> +        if (ret == 0)
> +                ret = drm_gem_handle_create(file, &obj->base, &handle);
> +
> +        /* drop reference from allocate - handle holds it now */
> +        drm_gem_object_unreference_unlocked(&obj->base);
> +        if (ret)
> +                return ret;
> +
> +        args->handle = handle;
> +        return 0;
> +}
> +
> +int
> +i915_gem_init_userptr(struct drm_device *dev)
> +{
> +#if defined(CONFIG_MMU_NOTIFIER)
> +        struct drm_i915_private *dev_priv = to_i915(dev);
> +        hash_init(dev_priv->mmu_notifiers);
> +#endif
> +        return 0;
> +}
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
> index ae8cf61b8ce3..cce9f559e3d7 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> @@ -201,6 +201,7 @@ static void print_error_buffers(struct drm_i915_error_state_buf *m,
>                 err_puts(m, tiling_flag(err->tiling));
>                 err_puts(m, dirty_flag(err->dirty));
>                 err_puts(m, purgeable_flag(err->purgeable));
> +               err_puts(m, err->userptr ? " userptr" : "");
>                 err_puts(m, err->ring != -1 ? " " : "");
>                 err_puts(m, ring_str(err->ring));
>                 err_puts(m, i915_cache_level_str(err->cache_level));
> @@ -584,6 +585,7 @@ static void capture_bo(struct drm_i915_error_buffer *err,
>         err->tiling = obj->tiling_mode;
>         err->dirty = obj->dirty;
>         err->purgeable = obj->madv != I915_MADV_WILLNEED;
> +       err->userptr = obj->userptr.mm != 0;
>         err->ring = obj->ring ? obj->ring->id : -1;
>         err->cache_level = obj->cache_level;
>  }
> diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
> index 37c8073a8246..6c145a0be250 100644
> --- a/include/uapi/drm/i915_drm.h
> +++ b/include/uapi/drm/i915_drm.h
> @@ -224,6 +224,7 @@ typedef struct _drm_i915_sarea {
>  #define DRM_I915_REG_READ 0x31
>  #define DRM_I915_GET_RESET_STATS 0x32
>  #define DRM_I915_GEM_CREATE2 0x33
> +#define DRM_I915_GEM_USERPTR 0x34
>
>  #define DRM_IOCTL_I915_INIT DRM_IOW( DRM_COMMAND_BASE + DRM_I915_INIT, drm_i915_init_t)
>  #define DRM_IOCTL_I915_FLUSH DRM_IO ( DRM_COMMAND_BASE + DRM_I915_FLUSH)
> @@ -275,6 +276,7 @@ typedef struct _drm_i915_sarea {
>  #define DRM_IOCTL_I915_GEM_CONTEXT_DESTROY DRM_IOW (DRM_COMMAND_BASE + DRM_I915_GEM_CONTEXT_DESTROY, struct drm_i915_gem_context_destroy)
>  #define DRM_IOCTL_I915_REG_READ DRM_IOWR (DRM_COMMAND_BASE + DRM_I915_REG_READ, struct drm_i915_reg_read)
>  #define DRM_IOCTL_I915_GET_RESET_STATS DRM_IOWR (DRM_COMMAND_BASE + DRM_I915_GET_RESET_STATS, struct drm_i915_reset_stats)
> +#define DRM_IOCTL_I915_GEM_USERPTR DRM_IOWR (DRM_COMMAND_BASE + DRM_I915_GEM_USERPTR, struct drm_i915_gem_userptr)
>
>  /* Allow drivers to submit batchbuffers directly to hardware, relying
>   * on the security mechanisms provided by hardware.
> @@ -1129,4 +1131,18 @@ struct drm_i915_reset_stats {
>         __u32 pad;
>  };
>
> +struct drm_i915_gem_userptr {
> +        __u64 user_ptr;
> +        __u64 user_size;
> +        __u32 flags;
> +#define I915_USERPTR_READ_ONLY 0x1
> +#define I915_USERPTR_UNSYNCHRONIZED 0x80000000
> +        /**
> +         * Returned handle for the object.
> +         *
> +         * Object handles are nonzero.
> +         */
> +        __u32 handle;
> +};
> +
>  #endif /* _UAPI_I915_DRM_H_ */
> --
> 1.8.5.3
>

_______________________________________________
Intel-gfx mailing list
Intel-gfx@xxxxxxxxxxxxxxxxxxxxx
http://lists.freedesktop.org/mailman/listinfo/intel-gfx