On 2021/05/20 15:10, Hannes Reinecke wrote: [...] >> +/* >> + * First phase of BIO mapping for targets with zone append emulation: >> + * check all BIO that change a zone writer pointer and change zone >> + * append operations into regular write operations. >> + */ >> +static bool dm_zone_map_bio_begin(struct mapped_device *md, >> + struct bio *orig_bio, struct bio *clone) >> +{ >> + sector_t zone_sectors = blk_queue_zone_sectors(md->queue); >> + unsigned int zno = bio_zone_no(orig_bio); >> + unsigned long flags; >> + bool good_io = false; >> + >> + spin_lock_irqsave(&md->zwp_offset_lock, flags); >> + >> + /* >> + * If the target zone is in an error state, recover by inspecting the >> + * zone to get its current write pointer position. Note that since the >> + * target zone is already locked, a BIO issuing context should never >> + * see the zone write in the DM_ZONE_UPDATING_WP_OFST state. >> + */ >> + if (md->zwp_offset[zno] == DM_ZONE_INVALID_WP_OFST) { >> + unsigned int wp_offset; >> + int ret; >> + >> + md->zwp_offset[zno] = DM_ZONE_UPDATING_WP_OFST; >> + >> + spin_unlock_irqrestore(&md->zwp_offset_lock, flags); >> + ret = dm_update_zone_wp_offset(md, zno, &wp_offset); >> + spin_lock_irqsave(&md->zwp_offset_lock, flags); >> + >> + if (ret) { >> + md->zwp_offset[zno] = DM_ZONE_INVALID_WP_OFST; >> + goto out; >> + } >> + md->zwp_offset[zno] = wp_offset; >> + } else if (md->zwp_offset[zno] == DM_ZONE_UPDATING_WP_OFST) { >> + DMWARN_LIMIT("Invalid DM_ZONE_UPDATING_WP_OFST state"); >> + goto out; >> + } >> + >> + switch (bio_op(orig_bio)) { >> + case REQ_OP_WRITE_ZEROES: >> + case REQ_OP_WRITE_SAME: >> + case REQ_OP_WRITE: >> + break; >> + case REQ_OP_ZONE_RESET: >> + case REQ_OP_ZONE_FINISH: >> + goto good; >> + case REQ_OP_ZONE_APPEND: >> + /* >> + * Change zone append operations into a non-mergeable regular >> + * writes directed at the current write pointer position of the >> + * target zone. >> + */ >> + clone->bi_opf = REQ_OP_WRITE | REQ_NOMERGE | >> + (orig_bio->bi_opf & (~REQ_OP_MASK)); >> + clone->bi_iter.bi_sector = >> + orig_bio->bi_iter.bi_sector + md->zwp_offset[zno]; >> + break; >> + default: >> + DMWARN_LIMIT("Invalid BIO operation"); >> + goto out; >> + } >> + >> + /* Cannot write to a full zone */ >> + if (md->zwp_offset[zno] >= zone_sectors) >> + goto out; >> + >> + /* Writes must be aligned to the zone write pointer */ >> + if ((clone->bi_iter.bi_sector & (zone_sectors - 1)) != md->zwp_offset[zno]) >> + goto out; >> + >> +good: >> + good_io = true; >> + >> +out: >> + spin_unlock_irqrestore(&md->zwp_offset_lock, flags); > > I'm not happy with the spinlock. Can't the same effect be achieved with > some clever READ_ONCE()/WRITE_ONCE/cmpexch magic? > Especially as you have a separate 'zone lock' mechanism ... hmmm... Let me see. Given that what the bio completion is relatively simple, it may be possible. With more coffee, I amy be able to come up with something clever. > >> + >> + return good_io; >> +} >> + >> +/* >> + * Second phase of BIO mapping for targets with zone append emulation: >> + * update the zone write pointer offset array to account for the additional >> + * data written to a zone. Note that at this point, the remapped clone BIO >> + * may already have completed, so we do not touch it. >> + */ >> +static blk_status_t dm_zone_map_bio_end(struct mapped_device *md, >> + struct bio *orig_bio, >> + unsigned int nr_sectors) >> +{ >> + unsigned int zno = bio_zone_no(orig_bio); >> + blk_status_t sts = BLK_STS_OK; >> + unsigned long flags; >> + >> + spin_lock_irqsave(&md->zwp_offset_lock, flags); >> + >> + /* Update the zone wp offset */ >> + switch (bio_op(orig_bio)) { >> + case REQ_OP_ZONE_RESET: >> + md->zwp_offset[zno] = 0; >> + break; >> + case REQ_OP_ZONE_FINISH: >> + md->zwp_offset[zno] = blk_queue_zone_sectors(md->queue); >> + break; >> + case REQ_OP_WRITE_ZEROES: >> + case REQ_OP_WRITE_SAME: >> + case REQ_OP_WRITE: >> + md->zwp_offset[zno] += nr_sectors; >> + break; >> + case REQ_OP_ZONE_APPEND: >> + /* >> + * Check that the target did not truncate the write operation >> + * emulating a zone append. >> + */ >> + if (nr_sectors != bio_sectors(orig_bio)) { >> + DMWARN_LIMIT("Truncated write for zone append"); >> + sts = BLK_STS_IOERR; >> + break; >> + } >> + md->zwp_offset[zno] += nr_sectors; >> + break; >> + default: >> + DMWARN_LIMIT("Invalid BIO operation"); >> + sts = BLK_STS_IOERR; >> + break; >> + } >> + >> + spin_unlock_irqrestore(&md->zwp_offset_lock, flags); > > You don't need the spinlock here; using WRITE_ONCE() should be sufficient. If other references to zwp_offset use READ_ONCE(), no ? [...] >> +void dm_zone_endio(struct dm_io *io, struct bio *clone) >> +{ >> + struct mapped_device *md = io->md; >> + struct request_queue *q = md->queue; >> + struct bio *orig_bio = io->orig_bio; >> + unsigned long flags; >> + unsigned int zno; >> + >> + /* >> + * For targets that do not emulate zone append, we only need to >> + * handle native zone-append bios. >> + */ >> + if (!dm_emulate_zone_append(md)) { >> + /* >> + * Get the offset within the zone of the written sector >> + * and add that to the original bio sector position. >> + */ >> + if (clone->bi_status == BLK_STS_OK && >> + bio_op(clone) == REQ_OP_ZONE_APPEND) { >> + sector_t mask = (sector_t)blk_queue_zone_sectors(q) - 1; >> + >> + orig_bio->bi_iter.bi_sector += >> + clone->bi_iter.bi_sector & mask; >> + } >> + >> + return; >> + } >> + >> + /* >> + * For targets that do emulate zone append, if the clone BIO does not >> + * own the target zone write lock, we have nothing to do. >> + */ >> + if (!bio_flagged(clone, BIO_ZONE_WRITE_LOCKED)) >> + return; >> + >> + zno = bio_zone_no(orig_bio); >> + >> + spin_lock_irqsave(&md->zwp_offset_lock, flags); >> + if (clone->bi_status != BLK_STS_OK) { >> + /* >> + * BIOs that modify a zone write pointer may leave the zone >> + * in an unknown state in case of failure (e.g. the write >> + * pointer was only partially advanced). In this case, set >> + * the target zone write pointer as invalid unless it is >> + * already being updated. >> + */ >> + if (md->zwp_offset[zno] != DM_ZONE_UPDATING_WP_OFST) >> + md->zwp_offset[zno] = DM_ZONE_INVALID_WP_OFST; >> + } else if (bio_op(orig_bio) == REQ_OP_ZONE_APPEND) { >> + /* >> + * Get the written sector for zone append operation that were >> + * emulated using regular write operations. >> + */ >> + if (WARN_ON_ONCE(md->zwp_offset[zno] < bio_sectors(orig_bio))) >> + md->zwp_offset[zno] = DM_ZONE_INVALID_WP_OFST; >> + else >> + orig_bio->bi_iter.bi_sector += >> + md->zwp_offset[zno] - bio_sectors(orig_bio); >> + } >> + spin_unlock_irqrestore(&md->zwp_offset_lock, flags); >> + > > Similar comments to the spinlock here. OK. Thanks for the review. -- Damien Le Moal Western Digital Research -- dm-devel mailing list dm-devel@xxxxxxxxxx https://listman.redhat.com/mailman/listinfo/dm-devel