Re: [Question] IO is split by block layer when size is larger than 4k

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Sun, Mar 15, 2020 at 9:34 AM Feng Li <lifeng1519@xxxxxxxxx> wrote:
>
> Hi Ming,
> This is my cmd to run qemu:
> qemu-2.12.0/x86_64-softmmu/qemu-system-x86_64 -enable-kvm -device
> virtio-balloon -cpu host -smp 4 -m 2G -drive
> file=/root/html/fedora-10g.img,format=raw,cache=none,aio=native,if=none,id=drive-virtio-disk1
> -device virtio-blk-pci,scsi=off,drive=drive-virtio-disk1,id=virtio-disk1,bootindex=1
> -drive file=/dev/sdb,format=raw,cache=none,aio=native,if=none,id=drive-virtio-disk2
> -device virtio-blk-pci,scsi=off,drive=drive-virtio-disk2,id=virtio-disk2,bootindex=2
> -device virtio-net,netdev=nw1,mac=00:11:22:EE:EE:10 -netdev
> tap,id=nw1,script=no,downscript=no,ifname=tap0 -serial mon:stdio
> -nographic -object
> memory-backend-file,id=mem0,size=2G,mem-path=/dev/hugepages,share=on
> -numa node,memdev=mem0 -vnc 0.0.0.0:100 -machine usb=on,nvdimm -device
> usb-tablet -monitor unix:///tmp/a.socket,server,nowait -qmp
> tcp:0.0.0.0:2234,server,nowait
>
> OS image is Fedora 31. Kernel is 5.3.7-301.fc31.x86_64.
>
> The addresses from virtio in qemu look like this:
> ========= size: 262144, iovcnt: 64
>       0: size: 4096 addr: 0x7fffc83f1000
>       1: size: 4096 addr: 0x7fffc8037000
>       2: size: 4096 addr: 0x7fffd3710000
>       3: size: 4096 addr: 0x7fffd5624000
>       4: size: 4096 addr: 0x7fffc766c000
>       5: size: 4096 addr: 0x7fffc7c21000
>       6: size: 4096 addr: 0x7fffc8d54000
>       7: size: 4096 addr: 0x7fffc8fc6000
>       8: size: 4096 addr: 0x7fffd5659000
>       9: size: 4096 addr: 0x7fffc7f88000
>       10: size: 4096 addr: 0x7fffc767b000
>       11: size: 4096 addr: 0x7fffc8332000
>       12: size: 4096 addr: 0x7fffb4297000
>       13: size: 4096 addr: 0x7fffc8888000
>       14: size: 4096 addr: 0x7fffc93d7000
>       15: size: 4096 addr: 0x7fffc9f1f000
>
> They are not contiguous pages, so the pages in the bvec are not contiguous
> physical pages.
>
> I don't know how to dump the bvec address in bio without recompiling the kernel.

I just ran a similar test on 5.3.11-100.fc29.x86_64, and my observation
is similar to yours.

However, I do not observe a similar problem on a 5.6-rc kernel in a VM; maybe the
kernel config causes the difference.

BTW, I usually use the attached bcc script to observe bvec pages, and you may
try that on an upstream kernel.

Thanks,
Ming
#!/usr/bin/python3
#
# bvec_pages.py
#
# Written as a basic example of reporting the average number of pages per bvec for each block device.
#
# USAGE: bvec_pages
#
# Tracing runs until interrupted. A Ctrl-C will print the per-device
# averages gathered so far then exit.
#
# Copyright (c) 2016 Ming Lei
# Licensed under the Apache License, Version 2.0 (the "License")
#
# 15-Aug-2015	Ming Lei	Created this.

from bcc import BPF
from ctypes import c_ushort, c_int, c_ulonglong
from time import sleep
from sys import argv
import os

# define BPF program
#
# The C text below is compiled by bcc and declares:
#   - a hash map named "bvec", keyed by a device number (struct key_t) and
#     holding three running totals per device (struct val_t): bvec_cnt,
#     size and bio_cnt;
#   - trace_submit_bio(), which reads bi_iter.bi_size and bi_vcnt from the
#     bio it is handed and, only when bi_vcnt is non-zero, adds them to the
#     totals of that bio's device and bumps bio_cnt by one.
#
# The device number is built with MKDEV() from bi_disk's major/first_minor
# when LINUX_VERSION_CODE >= 4.14, otherwise it is taken from
# bi_bdev->bd_dev.
#
# NOTE(review): the "// time block I/O" comment inside the C text does not
# match the code -- nothing here records timestamps.
# NOTE(review): lookup_or_init() appears to have been renamed in newer bcc
# releases, and bio->bi_disk may not exist on recent kernels -- confirm
# against the bcc/kernel versions this is run on.
bpf_text = """
#include <uapi/linux/ptrace.h>
#include <linux/blkdev.h>

struct key_t {
    unsigned dev_no;
};

struct val_t {
    u64 bvec_cnt;
    u64 size;
    u64 bio_cnt;
};

BPF_HASH(bvec, struct key_t, struct val_t);

// time block I/O
int trace_submit_bio(struct pt_regs *ctx, struct bio *bio)
{
    unsigned short vcnt;
    unsigned size;

    size = bio->bi_iter.bi_size;
    vcnt = bio->bi_vcnt;

    if (vcnt) {
        struct val_t *valp;
        struct key_t key;
        struct val_t zero = {0};

#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 14, 0)
        int maj, min;

        maj = bio->bi_disk->major;
        min = bio->bi_disk->first_minor;
        key.dev_no = (unsigned)MKDEV(maj, min);
#else
        key.dev_no = (unsigned)bio->bi_bdev->bd_dev;
#endif
        valp = bvec.lookup_or_init(&key, &zero);
        valp->bvec_cnt += vcnt;
        valp->size += size;
        valp->bio_cnt += 1;
    }

    //bpf_trace_printk("pages %d, vcnt: %d\\n", size>>12, vcnt);

    return 0;
}

"""

# Compile the BPF program and hook trace_submit_bio onto submit_bio.
b = BPF(text=bpf_text)
b.attach_kprobe(event="submit_bio", fn_name="trace_submit_bio")

# Tell the user tracing has started and how to stop it.
print("Tracing... Hit Ctrl-C to end.")

# Block here until interrupted; Ctrl-C falls through to the report below.
try:
    sleep(99999999)
except KeyboardInterrupt:
    pass

# Report one line of averages per device seen while tracing.
page_size = os.sysconf("SC_PAGE_SIZE")
# dev_no is split at bit 20: the BPF side builds it with MKDEV(maj, min).
minor_mask = (1 << 20) - 1
header = ("DEVICE", "PAGES_PER_BVEC", "SIZE_PER_BIO", "VCNT_PER_BIO")
print("\n%-7s %-12s %12s %12s" % header)
counts = b.get_table("bvec")
for dev, totals in counts.items():
    major = dev.dev_no >> 20
    minor = dev.dev_no & minor_mask
    # True division is kept on purpose; the %d conversions truncate.
    pages_per_bvec = (totals.size / page_size) / totals.bvec_cnt
    kb_per_bio = (totals.size >> 10) / totals.bio_cnt
    bvecs_per_bio = totals.bvec_cnt / totals.bio_cnt
    print("%-3d:%-3d %-12d %12dKB %12d" % (major, minor, pages_per_bvec, kb_per_bio, bvecs_per_bio))


[Index of Archives]     [Linux RAID]     [Linux SCSI]     [Linux ATA RAID]     [IDE]     [Linux Wireless]     [Linux Kernel]     [ATH6KL]     [Linux Bluetooth]     [Linux Netdev]     [Kernel Newbies]     [Security]     [Git]     [Netfilter]     [Bugtraq]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Device Mapper]

  Powered by Linux