Quoting Tvrtko Ursulin (2020-05-04 13:58:46)
> 
> On 04/05/2020 05:48, Chris Wilson wrote:
> > +static bool can_store_dword(const struct intel_engine_cs *engine)
> > +{
> > + return engine->class != VIDEO_DECODE_CLASS || !IS_GEN(engine->i915, 6);
> > +}
> 
> A bit confusing to a future reader how it differs from
> intel_engine_can_store_dword but apart from adding a comment I don't
> have any better ideas.

can_use_engine_for_reloc_without_killing_the_machine()

if (!engine_can_reloc_gpu()) ?

> 
> > +
> > static u32 *reloc_gpu(struct i915_execbuffer *eb,
> > struct i915_vma *vma,
> > unsigned int len)
> > @@ -1387,9 +1392,9 @@ static u32 *reloc_gpu(struct i915_execbuffer *eb,
> > if (unlikely(!cache->rq)) {
> > struct intel_engine_cs *engine = eb->engine;
> > 
> > - if (!intel_engine_can_store_dword(engine)) {
> > + if (!can_store_dword(engine)) {
> > engine = engine->gt->engine_class[COPY_ENGINE_CLASS][0];
> > - if (!engine || !intel_engine_can_store_dword(engine))
> > + if (!engine)
> > return ERR_PTR(-ENODEV);
> > }
> > 
> > @@ -1435,91 +1440,138 @@ static inline bool use_reloc_gpu(struct i915_vma *vma)
> > return !dma_resv_test_signaled_rcu(vma->resv, true);
> > }
> > 
> > -static u64
> > -relocate_entry(struct i915_vma *vma,
> > - const struct drm_i915_gem_relocation_entry *reloc,
> > - struct i915_execbuffer *eb,
> > - const struct i915_vma *target)
> > +static unsigned long vma_phys_addr(struct i915_vma *vma, u32 offset)
> > {
> > - u64 offset = reloc->offset;
> > - u64 target_offset = relocation_target(reloc, target);
> > - bool wide = eb->reloc_cache.use_64bit_reloc;
> > - void *vaddr;
> > + struct page *page;
> > + unsigned long addr;
> > 
> > - if (!eb->reloc_cache.vaddr && use_reloc_gpu(vma)) {
> > - const unsigned int gen = eb->reloc_cache.gen;
> > - unsigned int len;
> > - u32 *batch;
> > - u64 addr;
> > + GEM_BUG_ON(vma->pages != vma->obj->mm.pages);
> > 
> > - if (wide)
> > - len = offset & 7 ? 8 : 5;
> > - else if (gen >= 4)
> > - len = 4;
> > - else
> > - len = 3;
> > + page = i915_gem_object_get_page(vma->obj, offset >> PAGE_SHIFT);
> > + addr = PFN_PHYS(page_to_pfn(page));
> > + GEM_BUG_ON(overflows_type(addr, u32)); /* expected dma32 */
> > 
> > - batch = reloc_gpu(eb, vma, len);
> > - if (IS_ERR(batch))
> > - goto repeat;
> > + return addr + offset_in_page(offset);
> > +}
> > +
> > +static bool __reloc_entry_gpu(struct i915_vma *vma,
> > + struct i915_execbuffer *eb,
> > + u64 offset,
> > + u64 target_addr)
> > +{
> > + const unsigned int gen = eb->reloc_cache.gen;
> > + unsigned int len;
> > + u32 *batch;
> > + u64 addr;
> > +
> > + if (gen >= 8)
> > + len = offset & 7 ? 8 : 5;
> 
> This used to be driven off eb->reloc_cache.use_64bit_reloc, any practical
> effect of the change? Doesn't seem to.. Should you still use it for
> consistency though?

It used to be because we already pulled it into a local wide. I switched
to gen here for consistency with the if-else ladders.
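(For comparison, the removed lines above computed it as

        bool wide = eb->reloc_cache.use_64bit_reloc;

        if (wide)
                len = offset & 7 ? 8 : 5;
        else if (gen >= 4)
                len = 4;
        else
                len = 3;

and since use_64bit_reloc should only ever be set on gen8+, the two
spellings pick the same lengths; it is purely a readability choice.)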
> 
> > + else if (gen >= 4)
> > + len = 4;
> > + else
> > + len = 3;
> > +
> > + batch = reloc_gpu(eb, vma, len);
> > + if (IS_ERR(batch))
> > + return false;
> > +
> > + addr = gen8_canonical_addr(vma->node.start + offset);
> > + if (gen >= 8) {
> > + if (offset & 7) {
> > + *batch++ = MI_STORE_DWORD_IMM_GEN4;
> > + *batch++ = lower_32_bits(addr);
> > + *batch++ = upper_32_bits(addr);
> > + *batch++ = lower_32_bits(target_addr);
> > +
> > + addr = gen8_canonical_addr(addr + 4);
> > 
> > - addr = gen8_canonical_addr(vma->node.start + offset);
> > - if (wide) {
> > - if (offset & 7) {
> > - *batch++ = MI_STORE_DWORD_IMM_GEN4;
> > - *batch++ = lower_32_bits(addr);
> > - *batch++ = upper_32_bits(addr);
> > - *batch++ = lower_32_bits(target_offset);
> > -
> > - addr = gen8_canonical_addr(addr + 4);
> > -
> > - *batch++ = MI_STORE_DWORD_IMM_GEN4;
> > - *batch++ = lower_32_bits(addr);
> > - *batch++ = upper_32_bits(addr);
> > - *batch++ = upper_32_bits(target_offset);
> > - } else {
> > - *batch++ = (MI_STORE_DWORD_IMM_GEN4 | (1 << 21)) + 1;
> > - *batch++ = lower_32_bits(addr);
> > - *batch++ = upper_32_bits(addr);
> > - *batch++ = lower_32_bits(target_offset);
> > - *batch++ = upper_32_bits(target_offset);
> > - }
> > - } else if (gen >= 6) {
> > *batch++ = MI_STORE_DWORD_IMM_GEN4;
> > - *batch++ = 0;
> > - *batch++ = addr;
> > - *batch++ = target_offset;
> > - } else if (gen >= 4) {
> > - *batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
> > - *batch++ = 0;
> > - *batch++ = addr;
> > - *batch++ = target_offset;
> > + *batch++ = lower_32_bits(addr);
> > + *batch++ = upper_32_bits(addr);
> > + *batch++ = upper_32_bits(target_addr);
> > } else {
> > - *batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
> > - *batch++ = addr;
> > - *batch++ = target_offset;
> > + *batch++ = (MI_STORE_DWORD_IMM_GEN4 | (1 << 21)) + 1;
> > + *batch++ = lower_32_bits(addr);
> > + *batch++ = upper_32_bits(addr);
> > + *batch++ = lower_32_bits(target_addr);
> > + *batch++ = upper_32_bits(target_addr);
> > }
> > -
> > - goto out;
> > + } else if (gen >= 6) {
> > + *batch++ = MI_STORE_DWORD_IMM_GEN4;
> > + *batch++ = 0;
> > + *batch++ = addr;
> > + *batch++ = target_addr;
> > + } else if (IS_I965G(eb->i915)) {
> > + *batch++ = MI_STORE_DWORD_IMM_GEN4;
> > + *batch++ = 0;
> > + *batch++ = vma_phys_addr(vma, offset);
> > + *batch++ = target_addr;
> > + } else if (gen >= 4) {
> > + *batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
> > + *batch++ = 0;
> > + *batch++ = addr;
> > + *batch++ = target_addr;
> > + } else if (gen >= 3 &&
> > + !(IS_I915G(eb->i915) || IS_I915GM(eb->i915))) {
> > + *batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
> > + *batch++ = addr;
> > + *batch++ = target_addr;
> > + } else {
> > + *batch++ = MI_STORE_DWORD_IMM;
> > + *batch++ = vma_phys_addr(vma, offset);
> > + *batch++ = target_addr;
> > }
> > 
> > + return true;
> > +}
> > +
> > +static bool reloc_entry_gpu(struct i915_vma *vma,
> > + struct i915_execbuffer *eb,
> > + u64 offset,
> > + u64 target_addr)
> > +{
> > + if (eb->reloc_cache.vaddr)
> > + return false;
> > +
> > + if (!use_reloc_gpu(vma))
> > + return false;
> > +
> > + return __reloc_entry_gpu(vma, eb, offset, target_addr);
> > +}
> > +
> > +static u64
> > +relocate_entry(struct i915_vma *vma,
> > + const struct drm_i915_gem_relocation_entry *reloc,
> > + struct i915_execbuffer *eb,
> > + const struct i915_vma *target)
> > +{
> > + u64 target_addr = relocation_target(reloc, target);
> > + u64 offset = reloc->offset;
> > +
> > + if (!reloc_entry_gpu(vma, eb, offset, target_addr)) {
> > + bool wide = eb->reloc_cache.use_64bit_reloc;
> > + void *vaddr;
> > +
> > repeat:
> > - vaddr = reloc_vaddr(vma->obj, &eb->reloc_cache, offset >> PAGE_SHIFT);
> > - if (IS_ERR(vaddr))
> > - return PTR_ERR(vaddr);
> > + vaddr = reloc_vaddr(vma->obj,
> > + &eb->reloc_cache,
> > + offset >> PAGE_SHIFT);
> > + if (IS_ERR(vaddr))
> > + return PTR_ERR(vaddr);
> > 
> > - clflush_write32(vaddr + offset_in_page(offset),
> > - lower_32_bits(target_offset),
> > - eb->reloc_cache.vaddr);
> > + GEM_BUG_ON(!IS_ALIGNED(offset, sizeof(u32)));
> > + clflush_write32(vaddr + offset_in_page(offset),
> > + lower_32_bits(target_addr),
> > + eb->reloc_cache.vaddr);
> > 
> > - if (wide) {
> > - offset += sizeof(u32);
> > - target_offset >>= 32;
> > - wide = false;
> > - goto repeat;
> > + if (wide) {
> > + offset += sizeof(u32);
> > + target_addr >>= 32;
> > + wide = false;
> > + goto repeat;
> > + }
> > }
> > 
> > -out:
> > return target->node.start | UPDATE;
> > }
> > 
> > @@ -3022,3 +3074,7 @@ end:;
> > kvfree(exec2_list);
> > return err;
> > }
> 
> The diff is a bit messy but looks okay and in CI we trust.
> 
> > +
> > +#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
> > +#include "selftests/i915_gem_execbuffer.c"
> > +#endif
> > diff --git a/drivers/gpu/drm/i915/gem/selftests/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/selftests/i915_gem_execbuffer.c
> > new file mode 100644
> > index 000000000000..985f9fbd0ba0
> > --- /dev/null
> > +++ b/drivers/gpu/drm/i915/gem/selftests/i915_gem_execbuffer.c
> > @@ -0,0 +1,206 @@
> > +// SPDX-License-Identifier: MIT
> > +/*
> > + * Copyright © 2020 Intel Corporation
> > + */
> > +
> > +#include "i915_selftest.h"
> > +
> > +#include "gt/intel_engine_pm.h"
> > +#include "selftests/igt_flush_test.h"
> > +
> > +static void hexdump(const void *buf, size_t len)
> 
> Uh-oh seems to be the third copy! ;)

* jedi handwave

Yeah, I'm close to pulling them into i915_selftests.c as pr_hexdump().
A certain Tvrtko might one day win his argument to land the wrapper in lib/

> > +static int __igt_gpu_reloc(struct i915_execbuffer *eb,
> > + struct drm_i915_gem_object *obj)
> > +{
> > + enum {
> > + X = 0,
> > + Y = 3,
> > + Z = 8
> > + };
> > + const u64 mask = GENMASK_ULL(eb->reloc_cache.gen >= 8 ? 63 : 31, 0);
> 
> Mask is to remove the poison? use_64_bit relocs instead of gen, or this
> is more correct?

Might as well just use_64_bit_relocs. I suppose doing a manual check
against gen might help notice a use_64_bit_relocs slip?

> 
> > + const u32 *map = page_mask_bits(obj->mm.mapping);
> > + struct i915_request *rq;
> > + struct i915_vma *vma;
> > + int inc;
> > + int err;
> > +
> > + vma = i915_vma_instance(obj, eb->context->vm, NULL);
> > + if (IS_ERR(vma))
> > + return PTR_ERR(vma);
> > +
> > + err = i915_vma_pin(vma, 0, 0, PIN_USER | PIN_HIGH);
> > + if (err)
> > + return err;
> > +
> > + /* 8-Byte aligned */
> > + if (!__reloc_entry_gpu(vma, eb, X * sizeof(u32), X)) {
> > + err = -EIO;
> > + goto unpin_vma;
> > + }
> > +
> > + /* !8-Byte aligned */
> 
> What is the significance of the non 8 byte aligned?

That's the if (offset & 7) path in the gen8 code. If the offset is
8-byte aligned we can do a QWord MI_STORE_DATA_IMM, if it is only
4-byte aligned, we need 2 MI_STORE_DATA_IMM.
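(For reference, from the gen8 path in the hunk above: the qword-aligned
case is the single QWord write,

        *batch++ = (MI_STORE_DWORD_IMM_GEN4 | (1 << 21)) + 1;
        *batch++ = lower_32_bits(addr);
        *batch++ = upper_32_bits(addr);
        *batch++ = lower_32_bits(target_addr);
        *batch++ = upper_32_bits(target_addr);

while the offset & 7 case has to emit two separate DWord stores, one for
each half of the relocation.)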
> 
> > + if (!__reloc_entry_gpu(vma, eb, Y * sizeof(u32), Y)) {
> > + err = -EIO;
> > + goto unpin_vma;
> > + }
> > +
> > + /* Skip to the end of the cmd page */
> > + inc = PAGE_SIZE / sizeof(u32) - RELOC_TAIL - 1;
> > + inc -= eb->reloc_cache.rq_size;
> > + memset(eb->reloc_cache.rq_cmd + eb->reloc_cache.rq_size,
> > + 0, inc * sizeof(u32));
> > + eb->reloc_cache.rq_size += inc;
> > +
> > + /* Force batch chaining */
> > + if (!__reloc_entry_gpu(vma, eb, Z * sizeof(u32), Z)) {
> > + err = -EIO;
> > + goto unpin_vma;
> > + }
> > +
> > + GEM_BUG_ON(!eb->reloc_cache.rq);
> > + rq = i915_request_get(eb->reloc_cache.rq);
> > + err = reloc_gpu_flush(&eb->reloc_cache);
> > + if (err)
> > + goto put_rq;
> > + GEM_BUG_ON(eb->reloc_cache.rq);
> > +
> > + err = i915_gem_object_wait(obj, I915_WAIT_INTERRUPTIBLE, HZ / 2);
> > + if (err) {
> > + intel_gt_set_wedged(eb->engine->gt);
> > + goto put_rq;
> > + }
> > +
> > + if (!i915_request_completed(rq)) {
> > + pr_err("%s: did not wait for relocations!\n", eb->engine->name);
> > + err = -EINVAL;
> > + goto put_rq;
> > + }
> > +
> > + if (read_reloc(map, X, mask) != X) {
> > + pr_err("%s[X]: map[%d] %llx != %x\n",
> > + eb->engine->name, X, read_reloc(map, X, mask), X);
> > + err = -EINVAL;
> > + }
> > +
> > + if (read_reloc(map, Y, mask) != Y) {
> > + pr_err("%s[Y]: map[%d] %llx != %x\n",
> > + eb->engine->name, Y, read_reloc(map, Y, mask), Y);
> > + err = -EINVAL;
> > + }
> > +
> > + if (read_reloc(map, Z, mask) != Z) {
> > + pr_err("%s[Z]: map[%d] %llx != %x\n",
> > + eb->engine->name, Z, read_reloc(map, Z, mask), Z);
> > + err = -EINVAL;
> > + }
> 
> Why couldn't this just be an array of [0, 3, 8] instead of enums, and
> then all these tests could be a single loop? I can't figure out what the
> benefit of the enum is, in other words. Okay, in the test phase it can't
> be a simple loop since it needs the special case for the last iteration,
> but here it could be.

If you saw how far this came from the first version... I just liked
X, Y, Z as labels.
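(A sketch of the read-back loop being suggested, with a hypothetical
offsets[] table standing in for the enum:

        static const unsigned int offsets[] = { 0, 3, 8 };
        int i;

        for (i = 0; i < ARRAY_SIZE(offsets); i++) {
                u64 reloc = read_reloc(map, offsets[i], mask);

                if (reloc != offsets[i]) {
                        pr_err("%s[%d]: map[%d] %llx != %x\n",
                               eb->engine->name, i, offsets[i],
                               reloc, offsets[i]);
                        err = -EINVAL;
                }
        }

though the write phase above would still need its special case for the
last entry, as noted.)
-Chris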