On Sun, Mar 15, 2020 at 9:34 AM Feng Li <lifeng1519@xxxxxxxxx> wrote: > > Hi Ming, > This is my cmd to run qemu: > qemu-2.12.0/x86_64-softmmu/qemu-system-x86_64 -enable-kvm -device > virtio-balloon -cpu host -smp 4 -m 2G -drive > file=/root/html/fedora-10g.img,format=raw,cache=none,aio=native,if=none,id=drive-virtio-disk1 > -device virtio-blk-pci,scsi=off,drive=drive-virtio-disk1,id=virtio-disk1,bootindex=1 > -drive file=/dev/sdb,format=raw,cache=none,aio=native,if=none,id=drive-virtio-disk2 > -device virtio-blk-pci,scsi=off,drive=drive-virtio-disk2,id=virtio-disk2,bootindex=2 > -device virtio-net,netdev=nw1,mac=00:11:22:EE:EE:10 -netdev > tap,id=nw1,script=no,downscript=no,ifname=tap0 -serial mon:stdio > -nographic -object > memory-backend-file,id=mem0,size=2G,mem-path=/dev/hugepages,share=on > -numa node,memdev=mem0 -vnc 0.0.0.0:100 -machine usb=on,nvdimm -device > usb-tablet -monitor unix:///tmp/a.socket,server,nowait -qmp > tcp:0.0.0.0:2234,server,nowait > > OS image is Fedora 31. Kernel is 5.3.7-301.fc31.x86_64. > > The addresses from virtio in qemu look like this: > ========= size: 262144, iovcnt: 64 > 0: size: 4096 addr: 0x7fffc83f1000 > 1: size: 4096 addr: 0x7fffc8037000 > 2: size: 4096 addr: 0x7fffd3710000 > 3: size: 4096 addr: 0x7fffd5624000 > 4: size: 4096 addr: 0x7fffc766c000 > 5: size: 4096 addr: 0x7fffc7c21000 > 6: size: 4096 addr: 0x7fffc8d54000 > 7: size: 4096 addr: 0x7fffc8fc6000 > 8: size: 4096 addr: 0x7fffd5659000 > 9: size: 4096 addr: 0x7fffc7f88000 > 10: size: 4096 addr: 0x7fffc767b000 > 11: size: 4096 addr: 0x7fffc8332000 > 12: size: 4096 addr: 0x7fffb4297000 > 13: size: 4096 addr: 0x7fffc8888000 > 14: size: 4096 addr: 0x7fffc93d7000 > 15: size: 4096 addr: 0x7fffc9f1f000 > > They are not contiguous pages, so the pages in bvec are not contiguous > physical pages. > > I don't know how to dump the bvec address in bio without recompiling the kernel. I just ran a similar test on 5.3.11-100.fc29.x86_64, and the observation is similar to yours. 
However, I do not observe a similar problem with a 5.6-rc kernel in a VM; maybe the kernel config causes the difference. BTW, I usually use the attached bcc script to observe bvec pages, and you may try that on an upstream kernel. Thanks, Ming
#!/usr/bin/python3
#
# bvec_pages.py
#
# Basic example that reports, per block device, the average number of
# pages per bvec, KB per bio and bvecs per bio, sampled at submit_bio().
#
# USAGE: bvec_pages
#
# Tracing runs until Ctrl-C; the per-device averages gathered so far are
# then printed and the script exits.
#
# Copyright (c) 2016 Ming Lei
# Licensed under the Apache License, Version 2.0 (the "License")
#
# 15-Aug-2015 Ming Lei Created this.

from bcc import BPF
from ctypes import c_ushort, c_int, c_ulonglong
from time import sleep
from sys import argv
import os

# define BPF program
bpf_text = """
#include <uapi/linux/ptrace.h>
#include <linux/blkdev.h>

struct key_t {
    unsigned dev_no;
};

struct val_t {
    u64 bvec_cnt;
    u64 size;
    u64 bio_cnt;
};

BPF_HASH(bvec, struct key_t, struct val_t);

// Accumulate, per device, the bvec count, byte count and bio count of
// every submitted bio that carries at least one bvec.
int trace_submit_bio(struct pt_regs *ctx, struct bio *bio)
{
    unsigned short vcnt;
    unsigned size;

    size = bio->bi_iter.bi_size;
    vcnt = bio->bi_vcnt;

    if (vcnt) {
        struct val_t *valp;
        struct key_t key = {};
        struct val_t zero = {0};

        // struct bio carried a gendisk pointer (bi_disk) only from 4.14
        // up to 5.11; before and after that range it has bi_bdev.
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 14, 0) && \\
    LINUX_VERSION_CODE < KERNEL_VERSION(5, 12, 0)
        int maj, min;

        maj = bio->bi_disk->major;
        min = bio->bi_disk->first_minor;
        key.dev_no = (unsigned)MKDEV(maj, min);
#else
        key.dev_no = (unsigned)bio->bi_bdev->bd_dev;
#endif

        // Explicit lookup/insert instead of lookup_or_init(): it works
        // on both old and new bcc and has no hidden return.
        valp = bvec.lookup(&key);
        if (!valp) {
            bvec.update(&key, &zero);
            valp = bvec.lookup(&key);
            if (!valp)
                return 0;
        }

        valp->bvec_cnt += vcnt;
        valp->size += size;
        valp->bio_cnt += 1;
    }

    //bpf_trace_printk("pages %d, vcnt: %d\\n", size>>12, vcnt);
    return 0;
}
"""

# load BPF program
b = BPF(text=bpf_text)
b.attach_kprobe(event="submit_bio", fn_name="trace_submit_bio")

# header
print("Tracing... Hit Ctrl-C to end.")

# output: nothing is printed until the user interrupts the trace
try:
    sleep(99999999)
except KeyboardInterrupt:
    pass

page_size = os.sysconf("SC_PAGE_SIZE")

# dev_no is a kernel dev_t: major in the bits above MINORBITS, minor below.
MINORBITS = 20
MINORMASK = (1 << MINORBITS) - 1

print("\n%-7s %-12s %12s %12s" % ("DEVICE", "PAGES_PER_BVEC", "SIZE_PER_BIO", "VCNT_PER_BIO"))

counts = b.get_table("bvec")
for k, v in counts.items():
    # An entry exists only after a bio with vcnt != 0 was counted, so
    # bvec_cnt and bio_cnt are non-zero here. Use integer division so
    # that "%d" receives ints rather than Python 3 floats.
    pgs = v.size // page_size
    print("%-3d:%-3d %-12d %12dKB %12d" % (k.dev_no >> MINORBITS,
                                           k.dev_no & MINORMASK,
                                           pgs // v.bvec_cnt,
                                           (v.size >> 10) // v.bio_cnt,
                                           v.bvec_cnt // v.bio_cnt))