Re: [PATCH] block: Fix __blkdev_direct_IO()

Damien Le Moal <Damien.LeMoal@xxxxxxx> · Fri, 9 Aug 2019 13:01:54 +0000

On 2019/08/07 4:53, Jens Axboe wrote:
>> Hi Damien,
>>
>> Had you verified this patch with blktests and KASAN enabled? I think the
>> above patch introduced the following KASAN complaint:
> 
> I posted this in another thread, can you try?
> 
> 
> diff --git a/fs/block_dev.c b/fs/block_dev.c
> index a6f7c892cb4a..131e2e0582a6 100644
> --- a/fs/block_dev.c
> +++ b/fs/block_dev.c
> @@ -349,7 +349,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
>  	loff_t pos = iocb->ki_pos;
>  	blk_qc_t qc = BLK_QC_T_NONE;
>  	gfp_t gfp;
> -	ssize_t ret;
> +	int ret;
>  
>  	if ((pos | iov_iter_alignment(iter)) &
>  	    (bdev_logical_block_size(bdev) - 1))
> @@ -386,8 +386,6 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
>  
>  	ret = 0;
>  	for (;;) {
> -		int err;
> -
>  		bio_set_dev(bio, bdev);
>  		bio->bi_iter.bi_sector = pos >> 9;
>  		bio->bi_write_hint = iocb->ki_hint;
> @@ -395,10 +393,8 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
>  		bio->bi_end_io = blkdev_bio_end_io;
>  		bio->bi_ioprio = iocb->ki_ioprio;
>  
> -		err = bio_iov_iter_get_pages(bio, iter);
> -		if (unlikely(err)) {
> -			if (!ret)
> -				ret = err;
> +		ret = bio_iov_iter_get_pages(bio, iter);
> +		if (unlikely(ret)) {
>  			bio->bi_status = BLK_STS_IOERR;
>  			bio_endio(bio);
>  			break;
> @@ -421,7 +417,6 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
>  		if (nowait)
>  			bio->bi_opf |= (REQ_NOWAIT | REQ_NOWAIT_INLINE);
>  
> -		dio->size += bio->bi_iter.bi_size;
>  		pos += bio->bi_iter.bi_size;
>  
>  		nr_pages = iov_iter_npages(iter, BIO_MAX_PAGES);
> @@ -433,13 +428,13 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
>  				polled = true;
>  			}
>  
> +			dio->size += bio->bi_iter.bi_size;
>  			qc = submit_bio(bio);
>  			if (qc == BLK_QC_T_EAGAIN) {
> -				if (!ret)
> -					ret = -EAGAIN;
> +				dio->size -= bio->bi_iter.bi_size;
> +				ret = -EAGAIN;
>  				goto error;
>  			}
> -			ret = dio->size;
>  
>  			if (polled)
>  				WRITE_ONCE(iocb->ki_cookie, qc);
> @@ -460,18 +455,17 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
>  			atomic_inc(&dio->ref);
>  		}
>  
> +		dio->size += bio->bi_iter.bi_size;
>  		qc = submit_bio(bio);
>  		if (qc == BLK_QC_T_EAGAIN) {
> -			if (!ret)
> -				ret = -EAGAIN;
> +			dio->size -= bio->bi_iter.bi_size;
> +			ret = -EAGAIN;
>  			goto error;
>  		}
> -		ret = dio->size;
>  
>  		bio = bio_alloc(gfp, nr_pages);
>  		if (!bio) {
> -			if (!ret)
> -				ret = -EAGAIN;
> +			ret = -EAGAIN;
>  			goto error;
>  		}
>  	}
> @@ -496,6 +490,8 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
>  out:
>  	if (!ret)
>  		ret = blk_status_to_errno(dio->bio.bi_status);
> +	if (likely(!ret))
> +		ret = dio->size;
>  
>  	bio_put(&dio->bio);
>  	return ret;
> 

Jens,

I tested a slightly modified version of your patch. I think it is 100%
equivalent, but a little cleaner in my opinion.

diff --git a/fs/block_dev.c b/fs/block_dev.c
index a6f7c892cb4a..343cd716c53b 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -349,7 +349,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter
*iter, int nr_pages)
        loff_t pos = iocb->ki_pos;
        blk_qc_t qc = BLK_QC_T_NONE;
        gfp_t gfp;
-       ssize_t ret;
+       ssize_t ret = 0;

        if ((pos | iov_iter_alignment(iter)) &
            (bdev_logical_block_size(bdev) - 1))
@@ -386,7 +386,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter
*iter, int nr_pages)

        ret = 0;
        for (;;) {
-               int err;
+               unsigned int bio_size;

                bio_set_dev(bio, bdev);
                bio->bi_iter.bi_sector = pos >> 9;
@@ -395,10 +395,8 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter
*iter, int nr_pages)
                bio->bi_end_io = blkdev_bio_end_io;
                bio->bi_ioprio = iocb->ki_ioprio;

-               err = bio_iov_iter_get_pages(bio, iter);
-               if (unlikely(err)) {
-                       if (!ret)
-                               ret = err;
+               ret = bio_iov_iter_get_pages(bio, iter);
+               if (unlikely(ret)) {
                        bio->bi_status = BLK_STS_IOERR;
                        bio_endio(bio);
                        break;
@@ -421,7 +419,8 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter
*iter, int nr_pages)
                if (nowait)
                        bio->bi_opf |= (REQ_NOWAIT | REQ_NOWAIT_INLINE);

-               dio->size += bio->bi_iter.bi_size;
+               bio_size = bio->bi_iter.bi_size;
+               dio->size += bio_size;
                pos += bio->bi_iter.bi_size;

                nr_pages = iov_iter_npages(iter, BIO_MAX_PAGES);
@@ -435,11 +434,11 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter
*iter, int nr_pages)

                        qc = submit_bio(bio);
                        if (qc == BLK_QC_T_EAGAIN) {
-                               if (!ret)
+                               dio->size -= bio_size;
+                               if (!dio->size)
                                        ret = -EAGAIN;
                                goto error;
                        }
-                       ret = dio->size;

                        if (polled)
                                WRITE_ONCE(iocb->ki_cookie, qc);
@@ -462,18 +461,15 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter
*iter, int nr_pages)

                qc = submit_bio(bio);
                if (qc == BLK_QC_T_EAGAIN) {
-                       if (!ret)
+                       dio->size -= bio_size;
+                       if (!dio->size)
                                ret = -EAGAIN;
                        goto error;
                }
-               ret = dio->size;

                bio = bio_alloc(gfp, nr_pages);
-               if (!bio) {
-                       if (!ret)
-                               ret = -EAGAIN;
+               if (!bio)
                        goto error;
-               }
        }

        if (!is_poll)
@@ -496,6 +492,8 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter
*iter, int nr_pages)
 out:
        if (!ret)
                ret = blk_status_to_errno(dio->bio.bi_status);
+       if (likely(!ret))
+               ret = dio->size;

        bio_put(&dio->bio);
        return ret;

To test and make sure that I can consistently create EAGAIN errors with
RWF_NOWAIT, I set an HDD QD to 1, its nr_requests to 4 (minimum), enable
"nomerges" and set the disk max_sectors_kb to 512 KB. With this, at most 4 512KB
requests can be issued at any time, meaning that for a DIO larger than
5123KBx4=2MB, BIO fragments issuing will likely hit EAGAIN due to the lack of tags.

With this, here is what I get for tests with 4MB direct write IOs, with all
cases (sync/async, nowait or not).

## sync, wait case
4194304 4194304			<- requested size, size return by pwritev2()
0 KB read, 4096 KB writen	<- from iostat, KB amount read/written

regular sync+wait case is OK.

## sync, nowait case
4194304 4194304			<- KASAN use-after-free warning
0 KB read, 4608 KB writen	<- One more 512KB BIO issued than reported

With sync+nowait, sizes are correct but iostat shows too many writes. Also of
note is that there was some fragments failed on submission with EAGAIN, and
despite that, the size returned match the size asked for instead of a smaller size.

## async, wait case
4194304 4194304
0 KB read, 4096 KB writen

## async, nowait case
4194304 4194304
0 KB read, 4096 KB writen

The 2 async cases are OK.

Something is wrong with the sync+nowait case, but I cannot quite yet figure out
what. What I gather so far is this:
- BIO_MAX_PAGES being 256, the BIO fragments are 1MB. But the disk max_sector_kb
being 512KB, the BIO fragments get split into 2 512KB requests. It looks like
one of the fragment split is failed with EAGAIN, but the other split makes it to
the disk, showing an excess 512KB. Changing the disk max_sector_kb to 256 or
128, I see the excess being exactly that every time. That said, the total size
should still be lower than the requested 4MB since the BIO fragments submission
stopped before the entire dio was processed.

I have been playing around with the following patch on top of the previous one,
to make sure that BIOs are being freed (I believe the current code leaks them on
EAGAIN) and the dio completed.

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 343cd716c53b..bbb694c4a8c4 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -295,6 +295,31 @@ static int blkdev_iopoll(struct kiocb *kiocb, bool wait)
        return blk_poll(q, READ_ONCE(kiocb->ki_cookie), wait);
 }

+static void blkdev_dio_end(struct blkdev_dio *dio)
+{
+       pr_info("--- DIO done %zu B\n", dio->size);
+       if (!dio->is_sync) {
+               struct kiocb *iocb = dio->iocb;
+               ssize_t ret;
+
+               if (likely(!dio->bio.bi_status)) {
+                       ret = dio->size;
+                       iocb->ki_pos += ret;
+               } else {
+                       ret = blk_status_to_errno(dio->bio.bi_status);
+               }
+
+               dio->iocb->ki_complete(iocb, ret, 0);
+               if (dio->multi_bio)
+                       bio_put(&dio->bio);
+       } else {
+               struct task_struct *waiter = dio->waiter;
+
+               WRITE_ONCE(dio->waiter, NULL);
+               blk_wake_io_task(waiter);
+       }
+}
+
 static void blkdev_bio_end_io(struct bio *bio)
 {
        struct blkdev_dio *dio = bio->bi_private;
@@ -303,28 +328,9 @@ static void blkdev_bio_end_io(struct bio *bio)
        if (bio->bi_status && !dio->bio.bi_status)
                dio->bio.bi_status = bio->bi_status;

-       if (!dio->multi_bio || atomic_dec_and_test(&dio->ref)) {
-               if (!dio->is_sync) {
-                       struct kiocb *iocb = dio->iocb;
-                       ssize_t ret;
-
-                       if (likely(!dio->bio.bi_status)) {
-                               ret = dio->size;
-                               iocb->ki_pos += ret;
-                       } else {
-                               ret = blk_status_to_errno(dio->bio.bi_status);
-                       }
-
-                       dio->iocb->ki_complete(iocb, ret, 0);
-                       if (dio->multi_bio)
-                               bio_put(&dio->bio);
-               } else {
-                       struct task_struct *waiter = dio->waiter;
-
-                       WRITE_ONCE(dio->waiter, NULL);
-                       blk_wake_io_task(waiter);
-               }
-       }
+       pr_info("--- BIO done, dio %zu B\n", dio->size);
+       if (!dio->multi_bio || atomic_dec_and_test(&dio->ref))
+               blkdev_dio_end(dio);

        if (should_dirty) {
                bio_check_pages_dirty(bio);
@@ -351,6 +357,8 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter
*iter, int nr_pages)
        gfp_t gfp;
        ssize_t ret = 0;

+       pr_info("====== __blkdev_direct_IO ======\n");
+
        if ((pos | iov_iter_alignment(iter)) &
            (bdev_logical_block_size(bdev) - 1))
                return -EINVAL;
@@ -421,7 +429,9 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter
*iter, int nr_pages)

                bio_size = bio->bi_iter.bi_size;
                dio->size += bio_size;
-               pos += bio->bi_iter.bi_size;
+               pos += bio_size;
+
+               pr_info("+++ BIO frag %u B\n", bio_size);

                nr_pages = iov_iter_npages(iter, BIO_MAX_PAGES);
                if (!nr_pages) {
@@ -435,9 +445,9 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter
*iter, int nr_pages)
                        qc = submit_bio(bio);
                        if (qc == BLK_QC_T_EAGAIN) {
                                dio->size -= bio_size;
-                               if (!dio->size)
-                                       ret = -EAGAIN;
-                               goto error;
+                               ret = -EAGAIN;
+                               pr_info("last BIO submit eagain, dio %zu B\n",
dio->size);
+                               break;
                        }

                        if (polled)
@@ -462,19 +472,53 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter
*iter, int nr_pages)
                qc = submit_bio(bio);
                if (qc == BLK_QC_T_EAGAIN) {
                        dio->size -= bio_size;
-                       if (!dio->size)
-                               ret = -EAGAIN;
-                       goto error;
+                       pr_info("BIO submit eagain, dio %zu B\n", dio->size);
+                       /*
+                        * The reference on dio taken above is for the next
+                        * fragment, not his one.
+                        */
+                       atomic_dec(&dio->ref);
+                       ret = -EAGAIN;
+                       break;
                }

                bio = bio_alloc(gfp, nr_pages);
-               if (!bio)
-                       goto error;
+               if (!bio) {
+                       pr_info("BIO alloc eagain, dio %zu B\n", dio->size);
+                       ret = -EAGAIN;
+                       break;
+               }
        }

        if (!is_poll)
                blk_finish_plug(&plug);

+       if (ret == -EAGAIN) {
+               WARN_ON(!nowait);
+
+               /*
+                * If no BIO could be issued in the nowait case, fail everything
+                * with EAGAIN.
+                */
+               if (!dio->size)
+                       goto out;
+
+               /*
+                * A fragment of a multi-bio dio could not be issued in the
+                * nowait case: allow issued fragments to proceed (resulting in
+                * a short dio) and ensure completion of the dio by completing
+                * the failed fragment to decrement the extra reference taken
+                * on the dio and release the bio pages.
+                */
+               pr_info("DIO partial %zu B\n", dio->size);
+               WARN_ON(bio == &dio->bio);
+               if (bio) {
+                       bio_uninit(bio);
+                       blkdev_bio_end_io(bio);
+               }
+               ret = 0;
+       }
+
        if (!is_sync)
                return -EIOCBQUEUED;

@@ -497,10 +541,6 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter
*iter, int nr_pages)

        bio_put(&dio->bio);
        return ret;
-error:
-       if (!is_poll)
-               blk_finish_plug(&plug);
-       goto out;
 }

 static ssize_t

With this patch, I get a different behavior:

## sync, wait
4194304 4194304
0 KB read, 4096 KB writen

No change here.

## sync, nowait
... KASAN warning and hang, no output from test application
The debug messages show this:

[   90.142525] ====== __blkdev_direct_IO ======
[   90.147503] +++ BIO frag 1048576 B
[   90.151729] +++ BIO frag 1048576 B
[   90.155256] BIO submit eagain, dio 1048576 B
[   90.159571] DIO partial 1048576 B
[   90.162914] --- BIO done, dio 1048576 B
[   90.167345] --- BIO done, dio 1048576 B
[   90.171237] --- DIO done 1048576 B

That is, the first 1MB fragment is issued, the second 1MB fragment hits -EAGAIN
on submission and the DIO is completed as partial 1MB. But there are 2 fragment
completion shown being processed by blkdev_bio_end_io(), while only one was
issued. Right after these messages, KASAN screams:

==================================================================
[   90.182106] BUG: KASAN: use-after-free in blk_rq_map_sg+0x1227/0x1660
[   90.188561] Read of size 8 at addr ffff8890079b1100 by task kworker/31:1H/932
[   90.195725]
[   90.197235] CPU: 31 PID: 932 Comm: kworker/31:1H Not tainted 5.3.0-rc3+ #211

Clearly, a BIO is being freed while being executed, likely one of the split of
the large 1MB BIO that was failed with EAGAIN.

## async, wait
4194304 4194304
0 KB read, 4096 KB writen

No problem here, same as sync+wait case.

## async, nowait
4194304 3145728
0 KB read, 3072 KB writen

Here, the application return and shows a partial short write of 3MB. The debug
messages show that the last fragment failed submission on EAGAIN, leading to the
correctly reported 3MB short write.

[  190.437652] ====== __blkdev_direct_IO ======
[  190.442590] +++ BIO frag 1048576 B
[  190.446688] +++ BIO frag 1048576 B
[  190.450795] +++ BIO frag 1048576 B
[  190.454237] --- BIO done, dio 3145728 B
[  190.458790] +++ BIO frag 1048576 B
[  190.462297] last BIO submit eagain, dio 3145728 B
[  190.467045] DIO partial 3145728 B
[  190.470388] --- BIO done, dio 3145728 B
[  190.474288] --- BIO done, dio 3145728 B
[  190.480198] --- BIO done, dio 3145728 B
[  190.484092] --- DIO done 3145728 B

But still getting KASAN warning and hang. The warning is the same as sync+nowait
case:

==================================================================
[  190.494974] BUG: KASAN: use-after-free in blk_rq_map_sg+0x1227/0x1660
[  190.501433] Read of size 8 at addr ffff889050965500 by task kworker/31:1H/1125

And the hang is also the same, an oops with an an infinite stream of the
following messages.

[  190.917591] swiotlb_tbl_map_single: 4 callbacks suppressed
[  190.917596] mpt3sas 0000:d8:00.0: swiotlb buffer is full (sz: 4096 bytes),
total 0 (slots), used 0 (slots)
[  190.932770] mpt3sas 0000:d8:00.0: overflow 0xffe7a3fcc18c0000+4096 of DMA
mask 7fffffffffffffff bus mask 0
[  190.942450] WARNING: CPU: 31 PID: 1125 at kernel/dma/direct.c:35
report_addr+0xc5/0x170
[  190.950460] Modules linked in: rpcsec_gss_krb5 auth_rpcgss nfsv4 dns_resolver
nfs lockd grace fscache nf_conntrack_netbios_ns nf_conntrack_broadcast xt_CT
ip6t_rpfilter ip6t_REJECT nf_reject_ipv6 xt_conntrack ip6table_mangle
ip6table_raw ip6table_security iptable_mangle iptable_raw iptable_security
nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 libcrc32c ip_set nfnetlink
ip6table_filter ip6_tables sunrpc x86_pkg_temp_thermal coretemp kvm_intel kvm
crc32_pclmul acpi_power_meter mpt3sas i40e nvme raid_class nvme_core skd
[  190.996072] CPU: 31 PID: 1125 Comm: kworker/31:1H Tainted: G    B
5.3.0-rc3+ #211
[  191.004598] Hardware name: Supermicro Super Server/X11DPL-i, BIOS 3.0b 03/04/2019
[  191.012088] Workqueue: kblockd blk_mq_run_work_fn
[  191.016805] RIP: 0010:report_addr+0xc5/0x170
[  191.021086] Code: e1 48 c1 e9 03 80 3c 01 00 0f 85 97 00 00 00 4d 8b 24 24 b8
fe ff ff ff 49 39 c4 76 3a 80 3d 39 57 81 02 00 0f 84 a0 00 00 00 <0f> 0b 48 b8
00 00 00 00 00 fc ff df 48 c7 04 03 00 00 00 00 48 8b
[  191.039855] RSP: 0018:ffff88904f8776e8 EFLAGS: 00010282
[  191.045091] RAX: 0000000000000000 RBX: 1ffff11209f0eede RCX: 0000000000000000
[  191.052230] RDX: ffffed1209f0ee84 RSI: 0000000000000000 RDI: ffffed1209f0eed0
[  191.059373] RBP: ffff888817606f28 R08: 000000000000005e R09: ffffed120b2f7689
[  191.066513] R10: ffffed120b2f7688 R11: ffff8890597bb447 R12: 7fffffffffffffff
[  191.073656] R13: 7fffffffffffffff R14: ffe7a3fcc18c0fff R15: ffff888620280abc
[  191.080795] FS:  0000000000000000(0000) GS:ffff889059780000(0000)
knlGS:0000000000000000
[  191.088889] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  191.094646] CR2: 00007fafc9cea000 CR3: 000000107d614003 CR4: 00000000005606e0
[  191.101788] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[  191.108929] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[  191.116068] PKRU: 55555554
[  191.118781] Call Trace:
[  191.121239]  ? dma_direct_unmap_sg+0xb0/0xb0
[  191.125516]  ? swiotlb_map+0xd7/0x490
[  191.129192]  ? _raw_spin_unlock_irqrestore+0x45/0x50
[  191.134169]  dma_direct_map_page+0x23b/0x2f0
[  191.138447]  ? __kasan_report.cold+0xd/0x3e
[  191.142643]  ? report_addr+0x170/0x170
[  191.146403]  ? blk_rq_map_sg+0x1227/0x1660
[  191.150510]  ? blk_rq_map_sg+0x1118/0x1660
[  191.154623]  dma_direct_map_sg+0x108/0x1e0
[  191.158731]  scsi_dma_map+0x18e/0x290
[  191.162412]  _base_build_sg_scmd_ieee+0x134/0x1870 [mpt3sas]
[  191.168079]  ? memset+0x20/0x40
[  191.171241]  scsih_qcmd+0xfb7/0x1a20 [mpt3sas]
[  191.175701]  scsi_queue_rq+0xdb4/0x2ce0
[  191.179547]  blk_mq_dispatch_rq_list+0xbbc/0x1560
[  191.184261]  ? elv_rb_del+0x3b/0x80
[  191.187762]  ? blk_mq_get_driver_tag+0x4c0/0x4c0
[  191.192388]  ? dd_dispatch_request+0x210/0x930
[  191.196847]  blk_mq_do_dispatch_sched+0x16c/0x380
[  191.201560]  ? blk_mq_sched_mark_restart_hctx+0x60/0x60
[  191.206793]  ? __lock_acquire+0xd55/0x4560
[  191.210901]  blk_mq_sched_dispatch_requests+0x33c/0x530
[  191.216134]  ? lock_acquire+0x11d/0x2f0
[  191.219984]  ? blk_mq_sched_restart+0x70/0x70
[  191.224351]  ? process_one_work+0x677/0x1270
[  191.228635]  __blk_mq_run_hw_queue+0xeb/0x210
[  191.232999]  ? hctx_lock+0x160/0x160
[  191.236589]  ? process_one_work+0x693/0x1270
[  191.240872]  process_one_work+0x73a/0x1270
[  191.244981]  ? pwq_dec_nr_in_flight+0x2d0/0x2d0
[  191.249518]  ? do_raw_spin_lock+0x120/0x280
[  191.253718]  worker_thread+0x94/0xc70
[  191.257391]  ? __kthread_parkme+0xbd/0x1a0
[  191.261498]  ? process_one_work+0x1270/0x1270
[  191.265864]  kthread+0x2ed/0x3f0
[  191.269104]  ? kthread_park+0x130/0x130
[  191.272954]  ret_from_fork+0x24/0x30
[  191.276547] irq event stamp: 90
[  191.279695] hardirqs last  enabled at (89): [<ffffffff812bb154>]
ktime_get+0x104/0x130
[  191.287616] hardirqs last disabled at (90): [<ffffffff8282f1b4>]
_raw_spin_lock_irqsave+0x14/0x50
[  191.296491] softirqs last  enabled at (52): [<ffffffff8232df68>]
flush_backlog+0x378/0x5a0
[  191.304758] softirqs last disabled at (48): [<ffffffff8232dbf5>]
flush_backlog+0x5/0x5a0
[  191.312853] ---[ end trace f878467310c76256 ]---
[  191.317491] sd 10:0:0:0: scsi_dma_map failed: request for 524288 bytes!
[  191.343446] mpt3sas 0000:d8:00.0: swiotlb buffer is full (sz: 4096 bytes),
total 0 (slots), used 0 (slots)
[  191.353128] sd 10:0:0:0: scsi_dma_map failed: request for 524288 bytes!
[  191.387552] mpt3sas 0000:d8:00.0: swiotlb buffer is full (sz: 4096 bytes),
total 0 (slots), used 0 (slots)
[  191.397268] sd 10:0:0:0: scsi_dma_map failed: request for 524288 bytes!
[  191.431556] mpt3sas 0000:d8:00.0: swiotlb buffer is full (sz: 4096 bytes),
total 0 (slots), used 0 (slots)
[  191.441264] sd 10:0:0:0: scsi_dma_map failed: request for 524288 bytes!
[  191.475552] mpt3sas 0000:d8:00.0: swiotlb buffer is full (sz: 4096 bytes),
total 0 (slots), used 0 (slots)
[  191.485263] sd 10:0:0:0: scsi_dma_map failed: request for 524288 bytes!
[  191.519550] mpt3sas 0000:d8:00.0: swiotlb buffer is full (sz: 4096 bytes),
total 0 (slots), used 0 (slots)
[  191.529265] sd 10:0:0:0: scsi_dma_map failed: request for 524288 bytes!
[  191.563542] mpt3sas 0000:d8:00.0: swiotlb buffer is full (sz: 4096 bytes),
total 0 (slots), used 0 (slots)
[  191.573259] sd 10:0:0:0: scsi_dma_map failed: request for 524288 bytes!
[  191.607551] mpt3sas 0000:d8:00.0: swiotlb buffer is full (sz: 4096 bytes),
total 0 (slots), used 0 (slots)
[  191.617264] sd 10:0:0:0: scsi_dma_map failed: request for 524288 bytes!
[  191.651549] mpt3sas 0000:d8:00.0: swiotlb buffer is full (sz: 4096 bytes),
total 0 (slots), used 0 (slots)
[  191.661255] sd 10:0:0:0: scsi_dma_map failed: request for 524288 bytes!
[  191.695545] mpt3sas 0000:d8:00.0: swiotlb buffer is full (sz: 4096 bytes),
total 0 (slots), used 0 (slots)
[  191.705256] sd 10:0:0:0: scsi_dma_map failed: request for 524288 bytes!
[  191.739546] sd 10:0:0:0: scsi_dma_map failed: request for 524288 bytes!
[  191.773545] sd 10:0:0:0: scsi_dma_map failed: request for 524288 bytes!
[  191.807549] sd 10:0:0:0: scsi_dma_map failed: request for 524288 bytes!
[  191.841553] sd 10:0:0:0: scsi_dma_map failed: request for 524288 bytes!
[  191.875544] sd 10:0:0:0: scsi_dma_map failed: request for 524288 bytes!
[  191.909547] sd 10:0:0:0: scsi_dma_map failed: request for 524288 bytes!
[  191.943466] sd 10:0:0:0: scsi_dma_map failed: request for 524288 bytes!
[  191.977543] sd 10:0:0:0: scsi_dma_map failed: request for 524288 bytes!
[  192.011547] sd 10:0:0:0: scsi_dma_map failed: request for 524288 bytes!
[  192.045550] sd 10:0:0:0: scsi_dma_map failed: request for 524288 bytes!
[  192.079539] sd 10:0:0:0: scsi_dma_map failed: request for 524288 bytes!
[  192.113537] sd 10:0:0:0: scsi_dma_map failed: request for 524288 bytes!
[  192.147543] sd 10:0:0:0: scsi_dma_map failed: request for 524288 bytes!

I am still digging into all this, but strongly suspecting that this is related
to the NOWAIT flag not correctly handling cases where a BIO gets split into
smaller fragments and some of the fragments failing to be created with -EAGAIN.
Not 100% sure yet though.

Best regards.

-- 
Damien Le Moal
Western Digital Research