On Thu, Mar 02, 2023 at 09:34:22AM +0100, Thomas Hellström wrote:
> Add a suballocator test to get some test coverage for the new drm
> suballocator, and perform some basic timing (elapsed time).
>
> Signed-off-by: Thomas Hellström <thomas.hellstrom@xxxxxxxxxxxxxxx>
> ---
>  drivers/gpu/drm/Kconfig                   |   1 +
>  drivers/gpu/drm/tests/Makefile            |   3 +-
>  drivers/gpu/drm/tests/drm_suballoc_test.c | 356 ++++++++++++++++++++++
>  3 files changed, 359 insertions(+), 1 deletion(-)
>  create mode 100644 drivers/gpu/drm/tests/drm_suballoc_test.c
>
> diff --git a/drivers/gpu/drm/Kconfig b/drivers/gpu/drm/Kconfig
> index 8fbe57407c60..dced53723721 100644
> --- a/drivers/gpu/drm/Kconfig
> +++ b/drivers/gpu/drm/Kconfig
> @@ -78,6 +78,7 @@ config DRM_KUNIT_TEST
>  	select DRM_LIB_RANDOM
>  	select DRM_KMS_HELPER
>  	select DRM_BUDDY
> +	select DRM_SUBALLOC_HELPER
>  	select DRM_EXPORT_FOR_TESTS if m
>  	select DRM_KUNIT_TEST_HELPERS
>  	default KUNIT_ALL_TESTS
> diff --git a/drivers/gpu/drm/tests/Makefile b/drivers/gpu/drm/tests/Makefile
> index bca726a8f483..c664944a48ab 100644
> --- a/drivers/gpu/drm/tests/Makefile
> +++ b/drivers/gpu/drm/tests/Makefile
> @@ -17,6 +17,7 @@ obj-$(CONFIG_DRM_KUNIT_TEST) += \
>  	drm_modes_test.o \
>  	drm_plane_helper_test.o \
>  	drm_probe_helper_test.o \
> -	drm_rect_test.o
> +	drm_rect_test.o \
> +	drm_suballoc_test.o
>
>  CFLAGS_drm_mm_test.o := $(DISABLE_STRUCTLEAK_PLUGIN)
> diff --git a/drivers/gpu/drm/tests/drm_suballoc_test.c b/drivers/gpu/drm/tests/drm_suballoc_test.c
> new file mode 100644
> index 000000000000..e7303a5505a0
> --- /dev/null
> +++ b/drivers/gpu/drm/tests/drm_suballoc_test.c
> @@ -0,0 +1,356 @@
> +// SPDX-License-Identifier: GPL-2.0 OR MIT
> +/*
> + * Test case for the drm_suballoc suballocator manager
> + * Copyright 2023 Intel Corporation.
> + */
> +
> +#include <kunit/test.h>
> +
> +#include <linux/dma-fence.h>
> +#include <linux/ktime.h>
> +#include <linux/hrtimer.h>
> +#include <linux/sizes.h>
> +#include <linux/slab.h>
> +#include <linux/spinlock.h>
> +#include <linux/delay.h>
> +#include <drm/drm_suballoc.h>
> +
> +#define SA_ITERATIONS 10000
> +#define SA_SIZE SZ_1M
> +#define SA_DEFAULT_ALIGN SZ_4K
> +
> +static bool intr = true;
> +static bool from_reclaim;
> +static bool pre_throttle;
> +static unsigned int num_rings = 4;
> +static unsigned int iterations = SA_ITERATIONS;
> +
> +static atomic64_t free_space;
> +
> +static atomic_t other_id;
> +
> +struct suballoc_fence;
> +
> +/**
> + * struct suballoc_ring - fake gpu engine.
> + * @list: List of fences to signal.
> + * @signal_time: Accumulated fence signal execution time.
> + * @lock: Protects the suballoc ring members. Hardirq safe.
> + * @hrtimer: Fake execution time timer.
> + * @active: The currently active fence for which we have pending work or a
> + * timer running.
> + * @seqno: Fence submission seqno.
> + * @idx: Index for calculation of fake execution time.
> + * @work: Work struct used solely to move the timer start to a different
> + * processor than that used for submission.
> + */
> +struct suballoc_ring {
> +	ktime_t signal_time;
> +	struct list_head list;
> +	/* Protect the ring processing. */
> +	spinlock_t lock;
> +	struct hrtimer hrtimer;
> +	struct suballoc_fence *active;
> +	atomic64_t seqno;
> +	u32 idx;
> +	struct work_struct work;
> +};
> +
> +/**
> + * struct suballoc_fence - Hrtimer-driven fence.
> + * @fence: The base class fence struct.
> + * @link: Link for the ring's fence list.
> + * @size: The size of the suballocator range associated with this fence.
> + * @id: Cpu id likely used by the submission thread for suballoc allocation.
> + */
> +struct suballoc_fence {
> +	struct dma_fence fence;
> +	struct list_head link;
> +	size_t size;
> +	unsigned int id;
> +};
> +
> +/* A varying but repeatable fake execution time */
> +static ktime_t ring_next_delay(struct suballoc_ring *ring)
> +{
> +	return ns_to_ktime((u64)(++ring->idx % 8) * 200 * NSEC_PER_USEC);
> +}

Is there any way we can avoid using time (and a large number of
iterations) here, while keeping the coverage? drm_suballoc has the
longest runtime of all DRM tests (it takes ~60% of the whole DRM KUnit
execution, with drm_mm second at ~35%; without those two suites the DRM
tests execute in milliseconds rather than tens of seconds). Building
test cases that operate on a time basis makes it tricky to optimize the
runtime. If we extract the various modparams into separate test cases,
it's going to get even worse.

> +
> +/*
> + * Launch from a work item to decrease the likelihood of the timer expiry
> + * callback getting called from the allocating cpu.
> + * We want to trigger cache-line bouncing between allocating and signalling
> + * cpus.
> + */
> +static void ring_launch_timer_work(struct work_struct *work)
> +{
> +	struct suballoc_ring *ring =
> +		container_of(work, typeof(*ring), work);
> +
> +	spin_lock_irq(&ring->lock);
> +	if (ring->active)
> +		hrtimer_start_range_ns(&ring->hrtimer, ring_next_delay(ring),
> +				       100ULL * NSEC_PER_USEC,
> +				       HRTIMER_MODE_REL_PINNED);
> +
> +	spin_unlock_irq(&ring->lock);
> +}
> +
> +/*
> + * Signal an active fence and pull the next off the list if any and make it
> + * active.
> + */
> +static enum hrtimer_restart ring_hrtimer_expired(struct hrtimer *hrtimer)
> +{
> +	struct suballoc_ring *ring =
> +		container_of(hrtimer, typeof(*ring), hrtimer);
> +	struct suballoc_fence *sfence;
> +	ktime_t now, then;
> +	unsigned long irqflags;
> +
> +	spin_lock_irqsave(&ring->lock, irqflags);
> +	sfence = ring->active;
> +
> +	if (sfence) {
> +		struct dma_fence *fence = &sfence->fence;
> +
> +		if (sfence->id != get_cpu())
> +			atomic_inc(&other_id);
> +		put_cpu();
> +
> +		then = ktime_get();
> +		dma_fence_signal(fence);
> +		now = ktime_get();
> +		dma_fence_put(fence);
> +		ring->signal_time = ktime_add(ring->signal_time,
> +					      ktime_sub(now, then));
> +		ring->active = NULL;
> +		atomic64_add(sfence->size, &free_space);
> +	}
> +
> +	sfence = list_first_entry_or_null(&ring->list, typeof(*sfence), link);
> +	if (sfence) {
> +		list_del_init(&sfence->link);
> +		ring->active = sfence;
> +		spin_unlock_irqrestore(&ring->lock, irqflags);
> +		hrtimer_forward_now(&ring->hrtimer, ring_next_delay(ring));
> +		return HRTIMER_RESTART;
> +	}
> +	spin_unlock_irqrestore(&ring->lock, irqflags);
> +
> +	return HRTIMER_NORESTART;
> +}
> +
> +/*
> + * Queue a fence on a ring and if it's the first fence, make it active.
> + */
> +static void ring_add_fence(struct suballoc_ring *ring,
> +			   struct suballoc_fence *sfence)
> +{
> +	spin_lock_irq(&ring->lock);
> +	if (!ring->active) {
> +		ring->active = sfence;
> +		queue_work(system_unbound_wq, &ring->work);
> +	} else {
> +		list_add_tail(&sfence->link, &ring->list);
> +	}
> +	spin_unlock_irq(&ring->lock);
> +}
> +
> +static void ring_init(struct suballoc_ring *ring)
> +{
> +	memset(ring, 0, sizeof(*ring));
> +	INIT_LIST_HEAD(&ring->list);
> +	spin_lock_init(&ring->lock);
> +	hrtimer_init(&ring->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
> +	ring->hrtimer.function = ring_hrtimer_expired;
> +	INIT_WORK(&ring->work, ring_launch_timer_work);
> +}
> +
> +static bool ring_idle(struct suballoc_ring *ring)
> +{
> +	bool tmp;
> +
> +	spin_lock_irq(&ring->lock);
> +	tmp = !ring->active;
> +	spin_unlock_irq(&ring->lock);
> +
> +	return tmp;
> +}
> +
> +static const char *dma_fence_get_suballoc_name(struct dma_fence *fence)
> +{
> +	return "suballoc";
> +}
> +
> +static const struct dma_fence_ops dma_fence_suballoc_ops = {
> +	.get_driver_name = dma_fence_get_suballoc_name,
> +	.get_timeline_name = dma_fence_get_suballoc_name,
> +};
> +
> +static DEFINE_SPINLOCK(sa_fence_lock);
> +static ktime_t alloctime, freetime;
> +
> +static void drm_test_suballoc(struct kunit *test)
> +{
> +	struct suballoc_ring *rings;
> +	struct drm_suballoc_manager sa_manager;
> +	struct drm_suballoc *sa;
> +	struct suballoc_fence *sfence;
> +	struct dma_fence *fence;
> +	ktime_t then, now, signaltime;
> +	int i, ring, iter_tot = 0;
> +	size_t size;
> +	unsigned int align;
> +	unsigned long long soffset;
> +	gfp_t gfp;
> +
> +	rings = kvmalloc_array(num_rings, sizeof(*rings), GFP_KERNEL);
> +	if (!rings) {
> +		KUNIT_FAIL(test, "Failed allocating %u rings.\n");
> +		return;
> +	}

KUNIT_ASSERT_NOT_NULL? Though we might want to implement a test-resource
managed variant (kunit_kvmalloc_array) to not have to worry about the
lifecycle and freeing of the resources.
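Something along these lines, perhaps (untested sketch against the KUnit
resource API; kunit_kvmalloc_array() is made up here and doesn't exist
upstream):

struct kvmalloc_params {
	size_t n, size;
	gfp_t gfp;
};

static int kvmalloc_res_init(struct kunit_resource *res, void *context)
{
	struct kvmalloc_params *p = context;

	/* A kvfree()-compatible allocation owned by the test. */
	res->data = kvmalloc_array(p->n, p->size, p->gfp);

	return res->data ? 0 : -ENOMEM;
}

static void kvmalloc_res_free(struct kunit_resource *res)
{
	kvfree(res->data);
}

/*
 * Hypothetical helper: a kvmalloc_array() that is freed automatically
 * when the test terminates, analogous to kunit_kmalloc_array().
 */
static void *kunit_kvmalloc_array(struct kunit *test, size_t n,
				  size_t size, gfp_t gfp)
{
	struct kvmalloc_params params = { .n = n, .size = size, .gfp = gfp };

	return kunit_alloc_resource(test, kvmalloc_res_init,
				    kvmalloc_res_free, gfp, &params);
}

The allocation above would then become
rings = kunit_kvmalloc_array(test, num_rings, sizeof(*rings), GFP_KERNEL);
followed by KUNIT_ASSERT_NOT_NULL(test, rings), with no kvfree() needed
on any path.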
> +
> +	for (i = 0; i < num_rings; ++i)
> +		ring_init(rings + i);

With resource management, the rings could be allocated and initialized
in .init(). We would then do the flush and wait in .exit(), and as a
result we would be able to use asserts in the test body without worrying
about leaking.

> +
> +	atomic64_set(&free_space, SA_SIZE);
> +	drm_suballoc_manager_init(&sa_manager, SA_SIZE, SA_DEFAULT_ALIGN);

This could also be moved to .init().
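Roughly like this (untested sketch, reusing the hypothetical
kunit_kvmalloc_array() from above and stashing the state in test->priv):

struct suballoc_test_priv {
	struct suballoc_ring *rings;
	struct drm_suballoc_manager sa_manager;
};

static int drm_suballoc_test_init(struct kunit *test)
{
	struct suballoc_test_priv *priv;
	int i;

	priv = kunit_kzalloc(test, sizeof(*priv), GFP_KERNEL);
	if (!priv)
		return -ENOMEM;

	priv->rings = kunit_kvmalloc_array(test, num_rings,
					   sizeof(*priv->rings), GFP_KERNEL);
	if (!priv->rings)
		return -ENOMEM;

	for (i = 0; i < num_rings; ++i)
		ring_init(priv->rings + i);

	atomic64_set(&free_space, SA_SIZE);
	drm_suballoc_manager_init(&priv->sa_manager, SA_SIZE,
				  SA_DEFAULT_ALIGN);
	test->priv = priv;

	return 0;
}

static void drm_suballoc_test_exit(struct kunit *test)
{
	struct suballoc_test_priv *priv = test->priv;
	int i;

	/* Wait for all fences so that nothing is leaked... */
	for (i = 0; i < num_rings; ++i) {
		flush_work(&priv->rings[i].work);
		while (!ring_idle(priv->rings + i))
			schedule();
	}
	/* ...and let the manager clean up any pending frees. */
	drm_suballoc_manager_fini(&priv->sa_manager);
}

with the two hooked up via .init/.exit in struct kunit_suite. Then the
test body can bail out early with asserts at any point.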
> +
> +	if (from_reclaim)
> +		gfp = GFP_NOWAIT | __GFP_NOWARN;
> +	else
> +		gfp = GFP_KERNEL;
> +
> +	for (i = 0; i < iterations; ++i) {
> +		ring = i % num_rings;
> +		size = (ring + 1) * SZ_4K;
> +		align = 1 << (ring % const_ilog2(SA_DEFAULT_ALIGN));
> +
> +		if (pre_throttle)
> +			while (atomic64_read(&free_space) < SA_SIZE / 2)
> +				cpu_relax();
> +
> +		if (from_reclaim)
> +			fs_reclaim_acquire(GFP_KERNEL);
> +
> +		then = ktime_get();
> +		sa = drm_suballoc_new(&sa_manager, size, gfp, intr, align);
> +		now = ktime_get();
> +		if (from_reclaim)
> +			fs_reclaim_release(GFP_KERNEL);
> +
> +		alloctime = ktime_add(alloctime, ktime_sub(now, then));
> +
> +		iter_tot++;
> +		if (IS_ERR(sa)) {

KUNIT_ASSERT_NOT_ERR_OR_NULL?

> +			if (from_reclaim) {

drm_suballoc_new() can fail for reasons other than -ENOMEM under memory
pressure, yet with from_reclaim we're treating all errors as success.
Is that intentional?

> +				iter_tot--;
> +				continue;
> +			}
> +
> +			KUNIT_FAIL(test, "drm_suballoc_new() returned %pe\n",
> +				   sa);
> +			break;
> +		}
> +
> +		atomic64_sub(size, &free_space);
> +		soffset = drm_suballoc_soffset(sa);
> +		if (!IS_ALIGNED(soffset, align)) {
> +			drm_suballoc_free(sa, NULL);

Do we need to worry about calling free here? We shouldn't leak as long
as we wait upon all fences, since drm_suballoc_manager_fini() will do
the cleanup for us.

KUNIT_EXPECT_TRUE_MSG(..., IS_ALIGNED(soffset, align), ...)?

> +			KUNIT_FAIL(test, "Incorrect alignment: offset %llu align %u rem %llu\n",
> +				   soffset, align, soffset & (align - 1));
> +			break;
> +		}
> +
> +		if (drm_suballoc_eoffset(sa) > SA_SIZE) {
> +			drm_suballoc_free(sa, NULL);
> +			KUNIT_FAIL(test, "Allocation beyond end.\n");
> +			break;
> +		}

KUNIT_EXPECT_LE_MSG?

> +
> +		if (drm_suballoc_size(sa) < size ||
> +		    drm_suballoc_size(sa) >= size + align) {
> +			drm_suballoc_free(sa, NULL);
> +			KUNIT_FAIL(test, "Incorrect size.\n");
> +			break;
> +		}

KUNIT_EXPECT_GE and KUNIT_EXPECT_LT?
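I.e. something like (untested; note that EXPECTs don't abort the test,
so with the fence wait moved to .exit() the explicit
drm_suballoc_free(sa, NULL) calls on the failure paths shouldn't be
needed anymore):

		KUNIT_EXPECT_TRUE_MSG(test, IS_ALIGNED(soffset, align),
				      "Incorrect alignment: offset %llu align %u rem %llu\n",
				      soffset, align, soffset & (align - 1));
		KUNIT_EXPECT_LE_MSG(test, drm_suballoc_eoffset(sa), (u64)SA_SIZE,
				    "Allocation beyond end.\n");
		KUNIT_EXPECT_GE(test, drm_suballoc_size(sa), size);
		KUNIT_EXPECT_LT(test, drm_suballoc_size(sa), size + align);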
> +
> +		sfence = kmalloc(sizeof(*sfence), GFP_KERNEL);
> +		if (unlikely(!sfence)) {
> +			drm_suballoc_free(sa, NULL);
> +			KUNIT_FAIL(test, "Fence allocation failed.\n");
> +			break;
> +		}

It looks like sfence is never released. kunit_kmalloc?
KUNIT_ASSERT_NOT_NULL / KUNIT_ASSERT_NOT_ERR_OR_NULL?

> +		fence = &sfence->fence;
> +		dma_fence_init(fence, &dma_fence_suballoc_ops, &sa_fence_lock,
> +			       ring + 1,
> +			       atomic64_inc_return(&rings[ring].seqno));
> +		sfence->size = size;
> +		sfence->id = get_cpu();
> +		put_cpu();
> +
> +		ring_add_fence(rings + ring, sfence);
> +
> +		then = ktime_get();
> +		drm_suballoc_free(sa, fence);
> +		now = ktime_get();
> +		freetime = ktime_add(freetime, ktime_sub(now, then));
> +	}
> +
> +	signaltime = ktime_set(0, 0);
> +	for (i = 0; i < num_rings; ++i) {
> +		struct suballoc_ring *sring = &rings[i];
> +
> +		flush_work(&sring->work);
> +		while (!ring_idle(sring))
> +			schedule();
> +		signaltime = ktime_add(signaltime, sring->signal_time);
> +	}

This (and drm_suballoc_manager_fini()) could be moved to .exit().

> +
> +	kvfree(rings);
> +
> +	kunit_info(test, "signals on different processor: %d of %d\n",
> +		   atomic_read(&other_id), iter_tot);
> +	drm_suballoc_manager_fini(&sa_manager);
> +	kunit_info(test, "Alloc time was %llu ns.\n", (unsigned long long)
> +		   ktime_to_ns(alloctime) / iter_tot);
> +	kunit_info(test, "Free time was %llu ns.\n", (unsigned long long)
> +		   ktime_to_ns(freetime) / iter_tot);
> +	kunit_info(test, "Signal time was %llu ns.\n", (unsigned long long)
> +		   ktime_to_ns(signaltime) / iter_tot);

Do we need those timings? If we do expect certain values (probably with
some epsilon range), we should handle that as a separate test.

> +
> +	if (atomic64_read(&free_space) != SA_SIZE) {
> +		kunit_warn(test, "Test sanity check failed.\n");
> +		kunit_warn(test, "Space left at exit is %lld of %d\n",
> +			   (long long)atomic64_read(&free_space), SA_SIZE);
> +	}

If this is an error, let's add it as an "expect". Otherwise it's not
printed when the test PASSes (unless we're running with raw output).

> +}
> +
> +module_param(intr, bool, 0400);
> +MODULE_PARM_DESC(intr, "Whether to wait interruptible for space.");

This should be a separate test case (or a param to a test case), not a
modparam.

> +module_param(from_reclaim, bool, 0400);
> +MODULE_PARM_DESC(from_reclaim, "Whether to suballocate from reclaim context.");

Same here.

> +module_param(pre_throttle, bool, 0400);
> +MODULE_PARM_DESC(pre_throttle, "Whether to have the test throttle for space "
> +		 "before allocations.");

And here.

> +module_param(num_rings, uint, 0400);
> +MODULE_PARM_DESC(num_rings, "Number of rings signalling fences in order.\n");
> +module_param(iterations, uint, 0400);
> +MODULE_PARM_DESC(iterations, "Number of allocations to perform.\n");

Do we expect any difference in coverage for different numbers of rings /
iterations? What's the relation here? Would it be possible to extract
specific values (for which we expect different behavior) into separate
test cases?
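For the bool modparams, a parameterized case could look something like
this (untested sketch, all names made up):

struct drm_suballoc_test_params {
	const char *name;
	bool intr;
	bool from_reclaim;
	bool pre_throttle;
};

static const struct drm_suballoc_test_params suballoc_params[] = {
	{ .name = "interruptible", .intr = true },
	{ .name = "uninterruptible" },
	{ .name = "from-reclaim", .from_reclaim = true },
	{ .name = "pre-throttle", .intr = true, .pre_throttle = true },
};

static void suballoc_param_desc(const struct drm_suballoc_test_params *p,
				char *desc)
{
	strscpy(desc, p->name, KUNIT_PARAM_DESC_SIZE);
}

KUNIT_ARRAY_PARAM(suballoc, suballoc_params, suballoc_param_desc);

static struct kunit_case drm_suballoc_tests[] = {
	KUNIT_CASE_PARAM(drm_test_suballoc, suballoc_gen_params),
	{}
};

The test body would then read the flags from test->param_value instead
of the modparams, and each combination gets reported as its own result.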
-Michał

> +
> +static struct kunit_case drm_suballoc_tests[] = {
> +	KUNIT_CASE(drm_test_suballoc),
> +	{}
> +};
> +
> +static struct kunit_suite drm_suballoc_test_suite = {
> +	.name = "drm_suballoc",
> +	.test_cases = drm_suballoc_tests,
> +};
> +
> +kunit_test_suite(drm_suballoc_test_suite);
> +
> +MODULE_AUTHOR("Intel Corporation");
> +MODULE_DESCRIPTION("DRM suballocator Kunit test");
> +MODULE_LICENSE("Dual MIT/GPL");
> --
> 2.34.1
>