On 4/13/22 08:00, Mike Snitzer wrote: > On Tue, Apr 12 2022 at 6:38P -0400, > Damien Le Moal <damien.lemoal@xxxxxxxxxxxxxxxxxx> wrote: > >> On 4/13/22 05:52, Mike Snitzer wrote: >>> On Tue, Apr 12 2022 at 4:56P -0400, >>> Ming Lei <ming.lei@xxxxxxxxxx> wrote: >>> >>>> The current DM codes setup ->orig_bio after __map_bio() returns, >>>> and not only cause kernel panic for dm zone, but also a bit ugly >>>> and tricky, especially the waiting until ->orig_bio is set in >>>> dm_submit_bio_remap(). >>>> >>>> The reason is that one new bio is cloned from original FS bio to >>>> represent the mapped part, which just serves io accounting. >>>> >>>> Now we have switched to bdev based io accounting interface, and we >>>> can retrieve sectors/bio_op from both the real original bio and the >>>> added fields of .sector_offset & .sectors easily, so the new cloned >>>> bio isn't necessary any more. >>>> >>>> Not only fixes dm-zone's kernel panic, but also cleans up dm io >>>> accounting & split a bit. >>> >>> You're conflating quite a few things here. DM zone really has no >>> business accessing io->orig_bio (dm-zone.c can just as easily inspect >>> the tio->clone, because it hasn't been remapped yet it reflects the >>> io->origin_bio, so there is no need to look at io->orig_bio) -- but >>> yes I clearly broke things during the 5.18 merge and it needs fixing >>> ASAP. >> >> Problem is that we need to look at the BIO op in submission AND completion >> path to handle zone append requests. So looking at the clone on submission >> is OK since its op is still the original one. But on the completion path, >> the clone may now be a regular write emulating a zone append op. And >> looking at the clone only does not allow detecting that zone append. >> >> We could have the orig_bio op saved in dm_io to avoid completely looking >> at the orig_bio for detecting zone append. > > Can you please try the following patch? This works. I tested with a zoned nullblk + dm-crypt, forcing the zone append emulation code to be used. I ran zonefs tests on top of that with no issues. I will run btrfs tests too later today to exercise things a little more. > > Really sorry for breaking dm-zone.c; please teach this man how to test > the basics of all things dm-zoned (is there a testsuite in the tools > or something?). > > Thanks, > Mike > > diff --git a/drivers/md/dm-zone.c b/drivers/md/dm-zone.c > index c1ca9be4b79e..896127366000 100644 > --- a/drivers/md/dm-zone.c > +++ b/drivers/md/dm-zone.c > @@ -360,16 +360,20 @@ static int dm_update_zone_wp_offset(struct mapped_device *md, unsigned int zno, > return 0; > } > > +struct orig_bio_details { > + unsigned int op; > + unsigned int nr_sectors; > +}; > + > /* > * First phase of BIO mapping for targets with zone append emulation: > * check all BIO that change a zone writer pointer and change zone > * append operations into regular write operations. > */ > -static bool dm_zone_map_bio_begin(struct mapped_device *md, > - struct bio *orig_bio, struct bio *clone) > +static bool dm_zone_map_bio_begin(struct mapped_device *md, unsigned int zno, > + struct bio *clone) > { > sector_t zsectors = blk_queue_zone_sectors(md->queue); > - unsigned int zno = bio_zone_no(orig_bio); > unsigned int zwp_offset = READ_ONCE(md->zwp_offset[zno]); > > /* > @@ -384,7 +388,7 @@ static bool dm_zone_map_bio_begin(struct mapped_device *md, > WRITE_ONCE(md->zwp_offset[zno], zwp_offset); > } > > - switch (bio_op(orig_bio)) { > + switch (bio_op(clone)) { > case REQ_OP_ZONE_RESET: > case REQ_OP_ZONE_FINISH: > return true; > @@ -401,9 +405,8 @@ static bool dm_zone_map_bio_begin(struct mapped_device *md, > * target zone. > */ > clone->bi_opf = REQ_OP_WRITE | REQ_NOMERGE | > - (orig_bio->bi_opf & (~REQ_OP_MASK)); > - clone->bi_iter.bi_sector = > - orig_bio->bi_iter.bi_sector + zwp_offset; > + (clone->bi_opf & (~REQ_OP_MASK)); > + clone->bi_iter.bi_sector += zwp_offset; > break; > default: > DMWARN_LIMIT("Invalid BIO operation"); > @@ -423,11 +426,10 @@ static bool dm_zone_map_bio_begin(struct mapped_device *md, > * data written to a zone. Note that at this point, the remapped clone BIO > * may already have completed, so we do not touch it. > */ > -static blk_status_t dm_zone_map_bio_end(struct mapped_device *md, > - struct bio *orig_bio, > +static blk_status_t dm_zone_map_bio_end(struct mapped_device *md, unsigned int zno, > + struct orig_bio_details *orig_bio_details, > unsigned int nr_sectors) > { > - unsigned int zno = bio_zone_no(orig_bio); > unsigned int zwp_offset = READ_ONCE(md->zwp_offset[zno]); > > /* The clone BIO may already have been completed and failed */ > @@ -435,7 +437,7 @@ static blk_status_t dm_zone_map_bio_end(struct mapped_device *md, > return BLK_STS_IOERR; > > /* Update the zone wp offset */ > - switch (bio_op(orig_bio)) { > + switch (orig_bio_details->op) { > case REQ_OP_ZONE_RESET: > WRITE_ONCE(md->zwp_offset[zno], 0); > return BLK_STS_OK; > @@ -452,7 +454,7 @@ static blk_status_t dm_zone_map_bio_end(struct mapped_device *md, > * Check that the target did not truncate the write operation > * emulating a zone append. > */ > - if (nr_sectors != bio_sectors(orig_bio)) { > + if (nr_sectors != orig_bio_details->nr_sectors) { > DMWARN_LIMIT("Truncated write for zone append"); > return BLK_STS_IOERR; > } > @@ -488,7 +490,7 @@ static inline void dm_zone_unlock(struct request_queue *q, > bio_clear_flag(clone, BIO_ZONE_WRITE_LOCKED); > } > > -static bool dm_need_zone_wp_tracking(struct bio *orig_bio) > +static bool dm_need_zone_wp_tracking(struct bio *clone) > { > /* > * Special processing is not needed for operations that do not need the > @@ -496,15 +498,15 @@ static bool dm_need_zone_wp_tracking(struct bio *orig_bio) > * zones and all operations that do not modify directly a sequential > * zone write pointer. > */ > - if (op_is_flush(orig_bio->bi_opf) && !bio_sectors(orig_bio)) > + if (op_is_flush(clone->bi_opf) && !bio_sectors(clone)) > return false; > - switch (bio_op(orig_bio)) { > + switch (bio_op(clone)) { > case REQ_OP_WRITE_ZEROES: > case REQ_OP_WRITE: > case REQ_OP_ZONE_RESET: > case REQ_OP_ZONE_FINISH: > case REQ_OP_ZONE_APPEND: > - return bio_zone_is_seq(orig_bio); > + return bio_zone_is_seq(clone); > default: > return false; > } > @@ -519,8 +521,8 @@ int dm_zone_map_bio(struct dm_target_io *tio) > struct dm_target *ti = tio->ti; > struct mapped_device *md = io->md; > struct request_queue *q = md->queue; > - struct bio *orig_bio = io->orig_bio; > struct bio *clone = &tio->clone; > + struct orig_bio_details orig_bio_details; > unsigned int zno; > blk_status_t sts; > int r; > @@ -529,18 +531,21 @@ int dm_zone_map_bio(struct dm_target_io *tio) > * IOs that do not change a zone write pointer do not need > * any additional special processing. > */ > - if (!dm_need_zone_wp_tracking(orig_bio)) > + if (!dm_need_zone_wp_tracking(clone)) > return ti->type->map(ti, clone); > > /* Lock the target zone */ > - zno = bio_zone_no(orig_bio); > + zno = bio_zone_no(clone); > dm_zone_lock(q, zno, clone); > > + orig_bio_details.nr_sectors = bio_sectors(clone); > + orig_bio_details.op = bio_op(clone); > + > /* > * Check that the bio and the target zone write pointer offset are > * both valid, and if the bio is a zone append, remap it to a write. > */ > - if (!dm_zone_map_bio_begin(md, orig_bio, clone)) { > + if (!dm_zone_map_bio_begin(md, zno, clone)) { > dm_zone_unlock(q, zno, clone); > return DM_MAPIO_KILL; > } > @@ -560,7 +565,8 @@ int dm_zone_map_bio(struct dm_target_io *tio) > * The target submitted the clone BIO. The target zone will > * be unlocked on completion of the clone. > */ > - sts = dm_zone_map_bio_end(md, orig_bio, *tio->len_ptr); > + sts = dm_zone_map_bio_end(md, zno, &orig_bio_details, > + *tio->len_ptr); > break; > case DM_MAPIO_REMAPPED: > /* > @@ -568,7 +574,8 @@ int dm_zone_map_bio(struct dm_target_io *tio) > * unlock the target zone here as the clone will not be > * submitted. > */ > - sts = dm_zone_map_bio_end(md, orig_bio, *tio->len_ptr); > + sts = dm_zone_map_bio_end(md, zno, &orig_bio_details, > + *tio->len_ptr); > if (sts != BLK_STS_OK) > dm_zone_unlock(q, zno, clone); > break; > -- Damien Le Moal Western Digital Research