OK, now a 10-minute test with --time_based=1, so there is no startup overhead, and the I/O latencies are the same.

dl560g10spmem01:~ # perf record /usr/local/bin/fio --filename=/dev/dax0.0 --rw=randrw --refill_buffers --norandommap --randrepeat=0 --ioengine=mmap --bssplit=4k/4:8k/7:16k/7:32k/15:64k/65:128k/1:256k/1 --rwmixread=5 --iodepth=1 --numjobs=16 --runtime=600 --group_reporting --name=4-rand-rw-3xx --size=290g --numa_cpu_nodes=0 --time_based=1

4-rand-rw-3xx: (g=0): rw=randrw, bs=(R) 4096B-256KiB, (W) 4096B-256KiB, (T) 4096B-256KiB, ioengine=mmap, iodepth=1
...
fio-3.2-64-gaf7fb
Starting 16 processes

  read: IOPS=36.7k, BW=1865MiB/s (1956MB/s)(1093GiB/600002msec)
    clat (nsec): min=120, max=16029k, avg=6897.44, stdev=5503.68
     lat (nsec): min=144, max=16029k, avg=6932.35, stdev=5504.08
  write: IOPS=697k, BW=34.6GiB/s (37.2GB/s)(20.3TiB/600002msec)
    clat (nsec): min=111, max=72027k, avg=12546.82, stdev=9845.85
     lat (nsec): min=132, max=72027k, avg=12592.74, stdev=9851.98

Samples: 38M of event 'cycles:ppp', Event count (approx.): 182321387032131
Overhead  Command  Shared Object      Symbol
  77.20%  fio      fio                [.] clock_thread_fn
   8.32%  fio      libc-2.22.so       [.] __memcpy_avx_unaligned
   6.07%  fio      fio                [.] fill_random_buf
   4.82%  fio      [kernel.kallsyms]  [k] page_remove_rmap
   3.02%  fio      [kernel.kallsyms]  [k] native_write_msr_safe
   0.10%  fio      fio                [.] get_io_u

 88.99 │       lock   cmpxchg %edi,(%rsi)
       │     clock_thread_fn():
       │                     seq = *t->seq;
       │                     if (seq == UINT_MAX)
       │                             break;
       │                     __sync_synchronize();
       │                     tsc = get_cpu_clock();
       │             } while (seq != atomic32_compare_and_swap(t->seq, seq, seq + 1));
       │       cmp    %eax,%ecx
       │     ↓ je     118
       │             uint32_t seq;
       │             uint64_t tsc;
       │
       │             c->cpu = t->cpu;
       │             do {
       │                     seq = *t->seq;
  0.01 │ c9:   mov    0x68(%rbx),%rcx
 10.57 │       mov    (%rcx),%ecx
       │                     if (seq == UINT_MAX)
       │       cmp    $0xffffffff,%ecx

The same for sequential write, only much faster - write latency dropped from 12us to 5us, which shows up in the bandwidth. But the profile is still dominated by clock_thread_fn():

dl560g10spmem01:~ # perf record /usr/local/bin/fio --filename=/dev/dax0.0 --rw=write --refill_buffers --norandommap --randrepeat=0 --ioengine=mmap --bssplit=4k/4:8k/7:16k/7:32k/15:64k/65:128k/1:256k/1 --iodepth=1 --numjobs=16 --runtime=600 --group_reporting --name=4-rand-rw-3xx --size=290g --numa_cpu_nodes=0 --time_based=1

4-rand-rw-3xx: (g=0): rw=write, bs=(R) 4096B-256KiB, (W) 4096B-256KiB, (T) 4096B-256KiB, ioengine=mmap, iodepth=1
...
fio-3.2-64-gaf7fb
Starting 16 processes

  write: IOPS=949k, BW=47.2GiB/s (50.6GB/s)(27.6TiB/600002msec)
    clat (nsec): min=214, max=36102k, avg=7406.20, stdev=5435.29
     lat (nsec): min=233, max=36102k, avg=7442.12, stdev=5439.69

Samples: 38M of event 'cycles:ppp', Event count (approx.): 240873167676034
Overhead  Command  Shared Object      Symbol
  85.36%  fio      fio                [.] clock_thread_fn
   6.26%  fio      fio                [.] fill_random_buf
   5.07%  fio      libc-2.22.so       [.] __memcpy_avx_unaligned
   2.74%  fio      [kernel.kallsyms]  [k] radix_tree_next_chunk

 91.46 │       lock   cmpxchg %edi,(%rsi)
       │     clock_thread_fn():
       │                     seq = *t->seq;
       │                     if (seq == UINT_MAX)
       │                             break;
       │                     __sync_synchronize();
       │                     tsc = get_cpu_clock();
       │             } while (seq != atomic32_compare_and_swap(t->seq, seq, seq + 1));
       │       cmp    %eax,%ecx
       │     ↓ je     118
       │             uint32_t seq;
       │             uint64_t tsc;
       │
       │             c->cpu = t->cpu;
       │             do {
       │                     seq = *t->seq;
  0.01 │ c9:   mov    0x68(%rbx),%rcx
  8.10 │       mov    (%rcx),%ecx

Running it again gives the same results. Is it possible to break lat (nsec) down?
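For reference, the loop perf keeps blaming is the shared-sequence compare-and-swap shown in the annotation above. A rough standalone C sketch of that pattern (my own approximation of the annotated lines, not fio's actual source) shows why a single lock cmpxchg ends up with most of the samples:

/* Sketch of the pattern in clock_thread_fn() above (approximation, not
 * fio's code): each sampling thread reads its CPU's TSC, then retries a
 * CAS on one shared 32-bit sequence counter until its increment wins,
 * so all threads serialize on that single cache line. */
#include <stdint.h>
#include <x86intrin.h>          /* __rdtsc() */

struct clock_entry {
        uint32_t seq;           /* global order in which this TSC was taken */
        uint64_t tsc;
};

/* returns 0 once the shared counter is set to UINT32_MAX to stop sampling */
static int sample_clock(uint32_t *shared_seq, struct clock_entry *c)
{
        uint32_t seq;
        uint64_t tsc;

        do {
                seq = __atomic_load_n(shared_seq, __ATOMIC_ACQUIRE);
                if (seq == UINT32_MAX)
                        return 0;
                tsc = __rdtsc();
                /* lock cmpxchg: retry until our seq -> seq + 1 update wins */
        } while (!__atomic_compare_exchange_n(shared_seq, &seq, seq + 1,
                                              0, __ATOMIC_SEQ_CST,
                                              __ATOMIC_SEQ_CST));

        c->seq = seq;
        c->tsc = tsc;
        return 1;
}

int main(void)
{
        static uint32_t shared_seq;     /* shared by many threads in the real thing */
        struct clock_entry e;

        return sample_clock(&shared_seq, &e) ? 0 : 1;
}

When many threads retry against the same counter, most of their cycles land on the lock cmpxchg itself, which is consistent with the 88-91% perf attributes to that one instruction here.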
Anton

-----Original Message-----
From: Robert Elliott (Persistent Memory)
Sent: Monday, December 04, 2017 7:31 PM
To: Gavriliuk, Anton (HPS Ukraine) <anton.gavriliuk@xxxxxxx>; Jens Axboe <axboe@xxxxxxxxx>; Rebecca Cran <rebecca@xxxxxxxxxxxx>; Sitsofe Wheeler <sitsofe@xxxxxxxxx>
Cc: fio@xxxxxxxxxxxxxxx; Kani, Toshimitsu <toshi.kani@xxxxxxx>
Subject: RE: fio 3.2

> -----Original Message-----
> From: Gavriliuk, Anton (HPS Ukraine)
> Sent: Monday, December 4, 2017 11:12 AM
...
> I have run the next command several times,
>
> perf record /usr/local/bin/fio --filename=/dev/dax0.0 --rw=randrw
>   --refill_buffers --norandommap --randrepeat=0 --ioengine=mmap
>   --bssplit=4k/4:8k/7:16k/7:32k/15:64k/65:128k/1:256k/1 --rwmixread=5
>   --iodepth=1 --numjobs=16 --runtime=1800 --group_reporting
>   --name=4-rand-rw-3xx --size=290g --numa_cpu_nodes=0
>
> and each perf report shows clock_thread_fn() at the top,
>
> Samples: 8M of event 'cycles:ppp', Event count (approx.): 23391179916235
> Overhead  Command  Shared Object  Symbol
>   75.27%  fio      fio            [.] clock_thread_fn
>   13.68%  fio      libc-2.22.so   [.] __memcpy_avx_unaligned
>   10.05%  fio      fio            [.] fill_random_buf

This just runs one pass through memory, so the startup overhead is magnified. Add --time_based=1 so it really runs for 1800 s.
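On the other entry that is high in both profiles, __memcpy_avx_unaligned: with the mmap ioengine each I/O is essentially a memcpy into the mapped region, so that symbol is presumably the actual data movement rather than overhead. A minimal standalone sketch of that data path against a devdax device (not fio's mmap engine; it assumes /dev/dax0.0 exists and its alignment allows a 2 MiB mapping at offset 0):

/* Minimal sketch: one pass of 64k "writes" copied straight into a
 * persistent-memory mapping. Assumption: /dev/dax0.0 exists and a
 * 2 MiB mapping at offset 0 satisfies the device alignment. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        const size_t map_len = 2UL << 20;       /* 2 MiB */
        const size_t bs = 64 * 1024;            /* 64k, the dominant bssplit size */
        static char buf[64 * 1024];

        int fd = open("/dev/dax0.0", O_RDWR);
        if (fd < 0) {
                perror("open");
                return 1;
        }

        void *map = mmap(NULL, map_len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (map == MAP_FAILED) {
                perror("mmap");
                return 1;
        }

        /* each "write I/O" is just a memcpy into the mapping */
        for (size_t off = 0; off + bs <= map_len; off += bs)
                memcpy((char *)map + off, buf, bs);

        munmap(map, map_len);
        close(fd);
        return 0;
}

With an AVX-capable glibc, perf on a loop like this should attribute the copy time to the same __memcpy_avx_unaligned symbol seen in the reports above.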