On Thu, 7 Apr 2016 11:54:49 +0800 Ming Lei <tom.leiming@xxxxxxxxx> wrote: > On Mon, Apr 4, 2016 at 1:06 PM, Shaun Tancheff <shaun@xxxxxxxxxxxx> wrote: > > Recursive endio calls can exceed 16k stack. Tested with > > 32k stack and observed: > > > > Depth Size Location (293 entries) > > ----- ---- -------- > > 0) 21520 16 __module_text_address+0x12/0x60 > > 1) 21504 8 __module_address+0x5/0x140 > > 2) 21496 24 __module_text_address+0x12/0x60 > > 3) 21472 16 is_module_text_address+0xe/0x20 > > 4) 21456 8 __kernel_text_address+0x50/0x80 > > 5) 21448 136 print_context_stack+0x5a/0xf0 > > 6) 21312 144 dump_trace+0x14c/0x300 > > 7) 21168 8 save_stack_trace+0x2f/0x50 > > 8) 21160 88 set_track+0x64/0x130 > > 9) 21072 96 free_debug_processing+0x200/0x290 > > 10) 20976 176 __slab_free+0x164/0x290 > > 11) 20800 48 kmem_cache_free+0x1b0/0x1e0 > > 12) 20752 16 mempool_free_slab+0x17/0x20 > > 13) 20736 48 mempool_free+0x2f/0x90 > > 14) 20688 16 bvec_free+0x36/0x40 > > 15) 20672 32 bio_free+0x3b/0x60 > > 16) 20640 16 bio_put+0x23/0x30 > > 17) 20624 64 end_bio_extent_writepage+0xcf/0xe0 > > 18) 20560 48 bio_endio+0x57/0x90 > > 19) 20512 48 btrfs_end_bio+0xa8/0x160 > > 20) 20464 48 bio_endio+0x57/0x90 > > 21) 20416 112 dec_pending+0x121/0x270 > > 22) 20304 64 clone_endio+0x7a/0x100 > > 23) 20240 48 bio_endio+0x57/0x90 > > ... > > 277) 1264 64 clone_endio+0x7a/0x100 > > 278) 1200 48 bio_endio+0x57/0x90 > > 279) 1152 112 dec_pending+0x121/0x270 > > 280) 1040 64 clone_endio+0x7a/0x100 > > 281) 976 48 bio_endio+0x57/0x90 > > 282) 928 80 blk_update_request+0x8f/0x340 > > 283) 848 80 scsi_end_request+0x33/0x1c0 > > 284) 768 112 scsi_io_completion+0xb5/0x620 > > 285) 656 48 scsi_finish_command+0xcf/0x120 > > 286) 608 48 scsi_softirq_done+0x126/0x150 > > 287) 560 24 blk_done_softirq+0x78/0x90 > > 288) 536 136 __do_softirq+0xfd/0x280 > > 289) 400 16 run_ksoftirqd+0x28/0x50 > > 290) 384 64 smpboot_thread_fn+0x105/0x160 > > 291) 320 144 kthread+0xc9/0xe0 > > 292) 176 176 ret_from_fork+0x3f/0x70 > > > > Based on earlier patch by Mikulas Patocka <mpatocka@xxxxxxxxxx>. > > https://lkml.org/lkml/2008/6/24/18 > > Looks a empty link, and the following had the discussion too: > > http://linux-kernel.2935.n7.nabble.com/PATCH-1-2-Avoid-bio-endio-recursion-td306043.html > > > > > Signed-off-by: Shaun Tancheff <shaun.tancheff@xxxxxxxxxxx> > > --- > > block/bio.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++++++--- > > 1 file changed, 54 insertions(+), 3 deletions(-) > > > > diff --git a/block/bio.c b/block/bio.c > > index d42e69c..4ac19f6 100644 > > --- a/block/bio.c > > +++ b/block/bio.c > > @@ -1733,6 +1733,59 @@ static inline bool bio_remaining_done(struct bio *bio) > > return false; > > } > > > > +static DEFINE_PER_CPU(struct bio **, bio_end_queue) = { NULL }; > > The idea looks very nice, but this patch can't be applid to current block-next > branch. > > Looks it might be simpler to implement the approach by using percpu bio_list. Cc Mikulas & Christoph How about the following implementation with bio_list? Looks more readable and simpler. Just run a recent testcase of bcache over raid1(virtio-blk, virtio-blk), looks it does work, :-) --- diff --git a/block/bio.c b/block/bio.c index f124a0a..b97dfe8 100644 --- a/block/bio.c +++ b/block/bio.c @@ -68,6 +68,8 @@ static DEFINE_MUTEX(bio_slab_lock); static struct bio_slab *bio_slabs; static unsigned int bio_slab_nr, bio_slab_max; +static DEFINE_PER_CPU(struct bio_list *, bio_end_list) = { NULL }; + static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size) { unsigned int sz = sizeof(struct bio) + extra_size; @@ -1737,6 +1739,46 @@ static inline bool bio_remaining_done(struct bio *bio) return false; } +/* disable local irq when manipulating the percpu bio_list */ +static void unwind_bio_endio(struct bio *bio) +{ + struct bio_list bl_in_stack; + struct bio_list *bl; + unsigned long flags; + bool clear_list = false; + + local_irq_save(flags); + + bl = this_cpu_read(bio_end_list); + if (!bl) { + bl = &bl_in_stack; + bio_list_init(bl); + clear_list = true; + } + + if (!bio_list_empty(bl)) { + bio_list_add(bl, bio); + goto out; + } + + while (1) { + local_irq_restore(flags); + + if (!bio) + break; + + if (bio->bi_end_io) + bio->bi_end_io(bio); + + local_irq_save(flags); + bio = bio_list_pop(bl); + } + if (clear_list) + this_cpu_write(bio_end_list, NULL); + out: + local_irq_restore(flags); +} + /** * bio_endio - end I/O on a bio * @bio: bio @@ -1765,8 +1807,7 @@ again: goto again; } - if (bio->bi_end_io) - bio->bi_end_io(bio); + unwind_bio_endio(bio); } EXPORT_SYMBOL(bio_endio); > > > + > > +static struct bio *unwind_bio_endio(struct bio *bio) > > +{ > > + struct bio ***bio_end_queue_ptr; > > + struct bio *bio_queue; > > + struct bio *chain_bio = NULL; > > + int error = bio->bi_error; > > + unsigned long flags; > > + > > + local_irq_save(flags); > > If we may resue current->bio_list for this purpose, disabling irq should have > been avoided, but looks it is difficult to do in that way, maybe impossible. > Also not realistic to introduce a new per-task variable. > > > + bio_end_queue_ptr = this_cpu_ptr(&bio_end_queue); > > + > > + if (*bio_end_queue_ptr) { > > + **bio_end_queue_ptr = bio; > > + *bio_end_queue_ptr = &bio->bi_next; > > + bio->bi_next = NULL; > > + } else { > > + bio_queue = NULL; > > + *bio_end_queue_ptr = &bio_queue; > > Suggest to comment that bio_queue is the bottom-most stack variable, > otherwise looks a bit tricky to understand. > > > + > > +next_bio: > > + if (bio->bi_end_io == bio_chain_endio) { > > + struct bio *parent = bio->bi_private; > > + > > + bio_put(bio); > > + chain_bio = parent; > > + goto out; > > + } > > + > > + if (bio->bi_end_io) { > > + if (!bio->bi_error) > > + bio->bi_error = error; > > + bio->bi_end_io(bio); > > + } > > + > > + if (bio_queue) { > > + bio = bio_queue; > > + bio_queue = bio->bi_next; > > + if (!bio_queue) > > + *bio_end_queue_ptr = &bio_queue; > > + goto next_bio; > > + } > > + *bio_end_queue_ptr = NULL; > > + } > > + > > +out: > > + > > + local_irq_restore(flags); > > + > > + return chain_bio; > > +} > > + > > /** > > * bio_endio - end I/O on a bio > > * @bio: bio > > @@ -1762,9 +1815,7 @@ void bio_endio(struct bio *bio) > > bio_put(bio); > > bio = parent; > > } else { > > - if (bio->bi_end_io) > > - bio->bi_end_io(bio); > > - bio = NULL; > > + bio = unwind_bio_endio(bio); > > } > > } > > } > > -- > > 1.9.1 > > > > -- > > To unsubscribe from this list: send the line "unsubscribe linux-block" in > > the body of a message to majordomo@xxxxxxxxxxxxxxx > > More majordomo info at http://vger.kernel.org/majordomo-info.html > > > -- To unsubscribe from this list: send the line "unsubscribe linux-scsi" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html