Please find my response inline. Thanks, Aravind > -----Original Message----- > From: Chao Yu <yuchao0@xxxxxxxxxx> > Sent: Wednesday, July 8, 2020 8:04 AM > To: Aravind Ramesh <Aravind.Ramesh@xxxxxxx>; jaegeuk@xxxxxxxxxx; linux- > fsdevel@xxxxxxxxxxxxxxx; linux-f2fs-devel@xxxxxxxxxxxxxxxxxxxxx; hch@xxxxxx > Cc: Damien Le Moal <Damien.LeMoal@xxxxxxx>; Niklas Cassel > <Niklas.Cassel@xxxxxxx>; Matias Bjorling <Matias.Bjorling@xxxxxxx> > Subject: Re: [PATCH 1/2] f2fs: support zone capacity less than zone size > > On 2020/7/8 2:23, Aravind Ramesh wrote: > > Thanks for review Chao Yu. > > Please find my response inline. > > I will re-send a V2 after incorporating your comments. > > > > Regards, > > Aravind > > > >> -----Original Message----- > >> From: Chao Yu <yuchao0@xxxxxxxxxx> > >> Sent: Tuesday, July 7, 2020 5:49 PM > >> To: Aravind Ramesh <Aravind.Ramesh@xxxxxxx>; jaegeuk@xxxxxxxxxx; > >> linux- fsdevel@xxxxxxxxxxxxxxx; > >> linux-f2fs-devel@xxxxxxxxxxxxxxxxxxxxx; hch@xxxxxx > >> Cc: Damien Le Moal <Damien.LeMoal@xxxxxxx>; Niklas Cassel > >> <Niklas.Cassel@xxxxxxx>; Matias Bjorling <Matias.Bjorling@xxxxxxx> > >> Subject: Re: [PATCH 1/2] f2fs: support zone capacity less than zone > >> size > >> > >> On 2020/7/2 23:54, Aravind Ramesh wrote: > >>> NVMe Zoned Namespace devices can have zone-capacity less than zone-size. > >>> Zone-capacity indicates the maximum number of sectors that are > >>> usable in a zone beginning from the first sector of the zone. This > >>> makes the sectors sectors after the zone-capacity till zone-size to be unusable. > >>> This patch set tracks zone-size and zone-capacity in zoned devices > >>> and calculate the usable blocks per segment and usable segments per section. > >>> > >>> If zone-capacity is less than zone-size mark only those segments > >>> which start before zone-capacity as free segments. All segments at > >>> and beyond zone-capacity are treated as permanently used segments. 
> >>> In cases where zone-capacity does not align with segment size the > >>> last segment will start before zone-capacity and end beyond the > >>> zone-capacity of the zone. For such spanning segments only sectors > >>> within the > >> zone-capacity are used. > >>> > >>> Signed-off-by: Aravind Ramesh <aravind.ramesh@xxxxxxx> > >>> Signed-off-by: Damien Le Moal <damien.lemoal@xxxxxxx> > >>> Signed-off-by: Niklas Cassel <niklas.cassel@xxxxxxx> > >>> --- > >>> fs/f2fs/f2fs.h | 5 ++ > >>> fs/f2fs/segment.c | 136 > >> ++++++++++++++++++++++++++++++++++++++++++++-- > >>> fs/f2fs/segment.h | 6 +- > >>> fs/f2fs/super.c | 41 ++++++++++++-- > >>> 4 files changed, 176 insertions(+), 12 deletions(-) > >>> > >>> diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index > >>> e6e47618a357..73219e4e1ba4 100644 > >>> --- a/fs/f2fs/f2fs.h > >>> +++ b/fs/f2fs/f2fs.h > >>> @@ -1232,6 +1232,7 @@ struct f2fs_dev_info { #ifdef > >>> CONFIG_BLK_DEV_ZONED > >>> unsigned int nr_blkz; /* Total number of zones */ > >>> unsigned long *blkz_seq; /* Bitmap indicating sequential zones */ > >>> + block_t *zone_capacity_blocks; /* Array of zone capacity in blks > >>> +*/ > >>> #endif > >>> }; > >>> > >>> @@ -3395,6 +3396,10 @@ void > >>> f2fs_destroy_segment_manager_caches(void); > >>> int f2fs_rw_hint_to_seg_type(enum rw_hint hint); enum rw_hint > >>> f2fs_io_type_to_rw_hint(struct f2fs_sb_info *sbi, > >>> enum page_type type, enum temp_type temp); > >>> +unsigned int f2fs_usable_segs_in_sec(struct f2fs_sb_info *sbi, > >>> + unsigned int segno); > >>> +unsigned int f2fs_usable_blks_in_seg(struct f2fs_sb_info *sbi, > >>> + unsigned int segno); > >>> > >>> /* > >>> * checkpoint.c > >>> diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index > >>> c35614d255e1..d2156f3f56a5 100644 > >>> --- a/fs/f2fs/segment.c > >>> +++ b/fs/f2fs/segment.c > >>> @@ -4294,9 +4294,12 @@ static void init_free_segmap(struct > >>> f2fs_sb_info *sbi) { > >>> unsigned int start; > >>> int type; > >>> + struct seg_entry *sentry; 
> >>> > >>> for (start = 0; start < MAIN_SEGS(sbi); start++) { > >>> - struct seg_entry *sentry = get_seg_entry(sbi, start); > >>> + if (f2fs_usable_blks_in_seg(sbi, start) == 0) > >> > >> If usable blocks count is zero, shouldn't we update > >> SIT_I(sbi)->written_valid_blocks as we did when there is partial usable block in > current segment? > > If usable_block_count is zero, then it is like a dead segment, all > > blocks in the segment lie after the zone-capacity in the zone. So there can never be > a valid written content on these segments, hence it is not updated. > > In the other case, when a segment start before the zone-capacity and > > it ends beyond zone-capacity, then there are some blocks before zone-capacity > which can be used, so they are accounted for. > > I'm thinking that for limit_free_user_blocks() function, it assumes all unwritten > blocks as potential reclaimable blocks, however segment after zone-capacity should > never be used or reclaimable, it looks calculation could be not correct here. > The sbi->user_block_count is updated with the total usable_blocks in the full file system during the formatting of the file system using mkfs.f2fs. Please see the f2fs-tools patch series that I have submitted along with this patch set. So sbi->user_block_count reflects the actual number of usable blocks (i.e. total blocks - unusable blocks). > static inline block_t limit_free_user_blocks(struct f2fs_sb_info *sbi) { > block_t reclaimable_user_blocks = sbi->user_block_count - > written_block_count(sbi); > return (long)(reclaimable_user_blocks * LIMIT_FREE_BLOCK) / 100; } > > static inline bool has_enough_invalid_blocks(struct f2fs_sb_info *sbi) { > block_t invalid_user_blocks = sbi->user_block_count - > written_block_count(sbi); > /* > * Background GC is triggered with the following conditions. > * 1. There are a number of invalid blocks. > * 2. There is not enough free space. 
> */ > if (invalid_user_blocks > limit_invalid_user_blocks(sbi) && > free_user_blocks(sbi) < limit_free_user_blocks(sbi)) > > -- In this condition, free_user_blocks() doesn't include segments after zone-capacity, > however limit_free_user_blocks() includes them. In the second patch of this patch set, free_user_blocks is updated to account for the segments after zone-capacity. It basically gets the free segment (segments before zone capacity and free) block count and deducts the overprovision segment block count. It also takes the spanning segments' block count into account. > > return true; > return false; > } > > > >> > >>> + continue; > >>> + sentry = get_seg_entry(sbi, start); > >>> if (!sentry->valid_blocks) > >>> __set_free(sbi, start); > >>> else > >>> @@ -4316,7 +4319,7 @@ static void init_dirty_segmap(struct f2fs_sb_info > *sbi) > >>> struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); > >>> struct free_segmap_info *free_i = FREE_I(sbi); > >>> unsigned int segno = 0, offset = 0, secno; > >>> - unsigned short valid_blocks; > >>> + unsigned short valid_blocks, usable_blks_in_seg; > >>> unsigned short blks_per_sec = BLKS_PER_SEC(sbi); > >>> > >>> while (1) { > >>> @@ -4326,9 +4329,10 @@ static void init_dirty_segmap(struct f2fs_sb_info > *sbi) > >>> break; > >>> offset = segno + 1; > >>> valid_blocks = get_valid_blocks(sbi, segno, false); > >>> - if (valid_blocks == sbi->blocks_per_seg || !valid_blocks) > >>> + usable_blks_in_seg = f2fs_usable_blks_in_seg(sbi, segno); > >>> + if (valid_blocks == usable_blks_in_seg || !valid_blocks) > >> > >> It needs to traverse .cur_valid_map bitmap to check whether blocks in > >> range of [0, usable_blks_in_seg] are all valid or not, if there is at > >> least one usable block in the range, segment should be dirty. > > For the segments which start and end before zone-capacity are just like any > normal segments. 
> > Segments which start after the zone-capacity are fully unusable and are marked as > used in the free_seg_bitmap, so these segments are never used. > > Segments which span across the zone-capacity have some unusable blocks. Even > when blocks from these segments are allocated/deallocated the valid_blocks > counter is incremented/decremented, reflecting the current valid_blocks count. > > Comparing valid_blocks count with usable_blocks count in the segment can > indicate if the segment is dirty or fully used. > > I thought that if there is one valid block locates in range of [usable_blks_in_seg, > blks_per_seg] (after zone-capacity), the condition will be incorrect. That should > never happen, right? Yes, this will never happen. All blocks after zone-capacity are never usable. > > If so, how about adjusting check_block_count() to do sanity check on bitmap locates > after zone-capacity to make sure there is no free slots there. Ok, I will add this check in check_block_count. It makes sense. > > > Sorry, but could you please share why cur_valid_map needs to be traversed ? > > > >> > >> One question, if we select dirty segment which across zone-capacity > >> as opened segment (in curseg), how can we avoid allocating usable > >> block beyong zone-capacity in such segment via .cur_valid_map? > > For zoned devices, we have to allocate blocks sequentially, so it's always in LFS > manner it is allocated. > > The __has_curseg_space() checks for the usable blocks and stops allocating blocks > after zone-capacity. > > Oh, that was implemented in patch 2, I haven't checked that patch...sorry, however, > IMO, patch should be made to apply independently, what if do allocation only after > applying patch 1..., do we need to merge them into one? The patches were split keeping in mind that all data structure related and initialization Changes would go into patch 1 and IO path and GC related changes in patch 2. 
But if you think merging them into a single patch will be easier to review, then I shall merge it and send it as one patch in V2, along with other suggestions incorporated. Please let me know. > > >> > >>> continue; > >>> - if (valid_blocks > sbi->blocks_per_seg) { > >>> + if (valid_blocks > usable_blks_in_seg) { > >>> f2fs_bug_on(sbi, 1); > >>> continue; > >>> } > >>> @@ -4678,6 +4682,101 @@ int f2fs_check_write_pointer(struct > >>> f2fs_sb_info *sbi) > >>> > >>> return 0; > >>> } > >>> + > >>> +static bool is_conv_zone(struct f2fs_sb_info *sbi, unsigned int zone_idx, > >>> + unsigned int dev_idx) > >>> +{ > >>> + if (!bdev_is_zoned(FDEV(dev_idx).bdev)) > >>> + return true; > >>> + return !test_bit(zone_idx, FDEV(dev_idx).blkz_seq); } > >>> + > >>> +/* Return the zone index in the given device */ static unsigned int > >>> +get_zone_idx(struct f2fs_sb_info *sbi, unsigned int secno, > >>> + int dev_idx) > >>> +{ > >>> + block_t sec_start_blkaddr = START_BLOCK(sbi, GET_SEG_FROM_SEC(sbi, > >>> +secno)); > >>> + > >>> + return (sec_start_blkaddr - FDEV(dev_idx).start_blk) >> > >>> + sbi->log_blocks_per_blkz; > >>> +} > >>> + > >>> +/* > >>> + * Return the usable segments in a section based on the zone's > >>> + * corresponding zone capacity. Zone is equal to a section. 
> >>> + */ > >>> +static inline unsigned int f2fs_usable_zone_segs_in_sec( > >>> + struct f2fs_sb_info *sbi, unsigned int segno) { > >>> + unsigned int dev_idx, zone_idx, unusable_segs_in_sec; > >>> + > >>> + dev_idx = f2fs_target_device_index(sbi, START_BLOCK(sbi, segno)); > >>> + zone_idx = get_zone_idx(sbi, GET_SEC_FROM_SEG(sbi, segno), > >>> +dev_idx); > >>> + > >>> + /* Conventional zone's capacity is always equal to zone size */ > >>> + if (is_conv_zone(sbi, zone_idx, dev_idx)) > >>> + return sbi->segs_per_sec; > >>> + > >>> + /* > >>> + * If the zone_capacity_blocks array is NULL, then zone capacity > >>> + * is equal to the zone size for all zones > >>> + */ > >>> + if (!FDEV(dev_idx).zone_capacity_blocks) > >>> + return sbi->segs_per_sec; > >>> + > >>> + /* Get the segment count beyond zone capacity block */ > >>> + unusable_segs_in_sec = (sbi->blocks_per_blkz - > >>> + FDEV(dev_idx).zone_capacity_blocks[zone_idx]) > >>>> > >>> + sbi->log_blocks_per_seg; > >>> + return sbi->segs_per_sec - unusable_segs_in_sec; } > >>> + > >>> +/* > >>> + * Return the number of usable blocks in a segment. The number of > >>> +blocks > >>> + * returned is always equal to the number of blocks in a segment > >>> +for > >>> + * segments fully contained within a sequential zone capacity or a > >>> + * conventional zone. For segments partially contained in a > >>> +sequential > >>> + * zone capacity, the number of usable blocks up to the zone > >>> +capacity > >>> + * is returned. 0 is returned in all other cases. 
> >>> + */ > >>> +static inline unsigned int f2fs_usable_zone_blks_in_seg( > >>> + struct f2fs_sb_info *sbi, unsigned int segno) { > >>> + block_t seg_start, sec_start_blkaddr, sec_cap_blkaddr; > >>> + unsigned int zone_idx, dev_idx, secno; > >>> + > >>> + secno = GET_SEC_FROM_SEG(sbi, segno); > >>> + seg_start = START_BLOCK(sbi, segno); > >>> + dev_idx = f2fs_target_device_index(sbi, seg_start); > >>> + zone_idx = get_zone_idx(sbi, secno, dev_idx); > >>> + > >>> + /* > >>> + * Conventional zone's capacity is always equal to zone size, > >>> + * so, blocks per segment is unchanged. > >>> + */ > >>> + if (is_conv_zone(sbi, zone_idx, dev_idx)) > >>> + return sbi->blocks_per_seg; > >>> + > >>> + if (!FDEV(dev_idx).zone_capacity_blocks) > >>> + return sbi->blocks_per_seg; > >>> + > >>> + sec_start_blkaddr = START_BLOCK(sbi, GET_SEG_FROM_SEC(sbi, secno)); > >>> + sec_cap_blkaddr = sec_start_blkaddr + > >>> + FDEV(dev_idx).zone_capacity_blocks[zone_idx]; > >>> + > >>> + /* > >>> + * If segment starts before zone capacity and spans beyond > >>> + * zone capacity, then usable blocks are from seg start to > >>> + * zone capacity. If the segment starts after the zone capacity, > >>> + * then there are no usable blocks. 
> >>> + */ > >>> + if (seg_start >= sec_cap_blkaddr) > >>> + return 0; > >>> + if (seg_start + sbi->blocks_per_seg > sec_cap_blkaddr) > >>> + return sec_cap_blkaddr - seg_start; > >>> + > >>> + return sbi->blocks_per_seg; > >>> +} > >>> #else > >>> int f2fs_fix_curseg_write_pointer(struct f2fs_sb_info *sbi) { @@ > >>> -4688,7 +4787,36 @@ int f2fs_check_write_pointer(struct f2fs_sb_info > >>> *sbi) { > >>> return 0; > >>> } > >>> + > >>> +static inline unsigned int f2fs_usable_zone_blks_in_seg(struct f2fs_sb_info > *sbi, > >>> + unsigned int segno) > >>> +{ > >>> + return 0; > >>> +} > >>> + > >>> +static inline unsigned int f2fs_usable_zone_segs_in_sec(struct f2fs_sb_info > *sbi, > >>> + unsigned int segno) > >>> +{ > >>> + return 0; > >>> +} > >>> #endif > >>> +unsigned int f2fs_usable_blks_in_seg(struct f2fs_sb_info *sbi, > >>> + unsigned int segno) > >>> +{ > >>> + if (f2fs_sb_has_blkzoned(sbi)) > >>> + return f2fs_usable_zone_blks_in_seg(sbi, segno); > >>> + > >>> + return sbi->blocks_per_seg; > >>> +} > >>> + > >>> +unsigned int f2fs_usable_segs_in_sec(struct f2fs_sb_info *sbi, > >>> + unsigned int segno) > >>> +{ > >>> + if (f2fs_sb_has_blkzoned(sbi)) > >>> + return f2fs_usable_zone_segs_in_sec(sbi, segno); > >>> + > >>> + return sbi->segs_per_sec; > >>> +} > >>> > >>> /* > >>> * Update min, max modified time for cost-benefit GC algorithm diff > >>> --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index > >>> f261e3e6a69b..79b0dc33feaf 100644 > >>> --- a/fs/f2fs/segment.h > >>> +++ b/fs/f2fs/segment.h > >>> @@ -411,6 +411,7 @@ static inline void __set_free(struct > >>> f2fs_sb_info *sbi, > >> unsigned int segno) > >>> unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); > >>> unsigned int start_segno = GET_SEG_FROM_SEC(sbi, secno); > >>> unsigned int next; > >>> + unsigned int usable_segs = f2fs_usable_segs_in_sec(sbi, segno); > >>> > >>> spin_lock(&free_i->segmap_lock); > >>> clear_bit(segno, free_i->free_segmap); @@ -418,7 +419,7 @@ static > >>> inline void 
__set_free(struct f2fs_sb_info *sbi, unsigned int segno) > >>> > >>> next = find_next_bit(free_i->free_segmap, > >>> start_segno + sbi->segs_per_sec, start_segno); > >>> - if (next >= start_segno + sbi->segs_per_sec) { > >>> + if (next >= start_segno + usable_segs) { > >>> clear_bit(secno, free_i->free_secmap); > >>> free_i->free_sections++; > >>> } > >>> @@ -444,6 +445,7 @@ static inline void __set_test_and_free(struct > >>> f2fs_sb_info > >> *sbi, > >>> unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); > >>> unsigned int start_segno = GET_SEG_FROM_SEC(sbi, secno); > >>> unsigned int next; > >>> + unsigned int usable_segs = f2fs_usable_segs_in_sec(sbi, segno); > >>> > >>> spin_lock(&free_i->segmap_lock); > >>> if (test_and_clear_bit(segno, free_i->free_segmap)) { @@ -453,7 > >>> +455,7 @@ static inline void __set_test_and_free(struct f2fs_sb_info > >>> +*sbi, > >>> goto skip_free; > >>> next = find_next_bit(free_i->free_segmap, > >>> start_segno + sbi->segs_per_sec, start_segno); > >>> - if (next >= start_segno + sbi->segs_per_sec) { > >>> + if (next >= start_segno + usable_segs) { > >>> if (test_and_clear_bit(secno, free_i->free_secmap)) > >>> free_i->free_sections++; > >>> } > >>> diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index > >>> 80cb7cd358f8..2686b07ae7eb 100644 > >>> --- a/fs/f2fs/super.c > >>> +++ b/fs/f2fs/super.c > >>> @@ -1164,6 +1164,7 @@ static void destroy_device_list(struct f2fs_sb_info > *sbi) > >>> blkdev_put(FDEV(i).bdev, FMODE_EXCL); #ifdef > >> CONFIG_BLK_DEV_ZONED > >>> kvfree(FDEV(i).blkz_seq); > >>> + kvfree(FDEV(i).zone_capacity_blocks); > >> > >> Now, f2fs_kzalloc won't allocate vmalloc's memory, so it's safe to use kfree(). 
> > Ok > >> > >>> #endif > >>> } > >>> kvfree(sbi->devs); > >>> @@ -3039,13 +3040,26 @@ static int init_percpu_info(struct > >>> f2fs_sb_info *sbi) } > >>> > >>> #ifdef CONFIG_BLK_DEV_ZONED > >>> + > >>> +struct f2fs_report_zones_args { > >>> + struct f2fs_dev_info *dev; > >>> + bool zone_cap_mismatch; > >>> +}; > >>> + > >>> static int f2fs_report_zone_cb(struct blk_zone *zone, unsigned int idx, > >>> - void *data) > >>> + void *data) > >>> { > >>> - struct f2fs_dev_info *dev = data; > >>> + struct f2fs_report_zones_args *rz_args = data; > >>> + > >>> + if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) > >>> + return 0; > >>> + > >>> + set_bit(idx, rz_args->dev->blkz_seq); > >>> + rz_args->dev->zone_capacity_blocks[idx] = zone->capacity >> > >>> + F2FS_LOG_SECTORS_PER_BLOCK; > >>> + if (zone->len != zone->capacity && !rz_args->zone_cap_mismatch) > >>> + rz_args->zone_cap_mismatch = true; > >>> > >>> - if (zone->type != BLK_ZONE_TYPE_CONVENTIONAL) > >>> - set_bit(idx, dev->blkz_seq); > >>> return 0; > >>> } > >>> > >>> @@ -3053,6 +3067,7 @@ static int init_blkz_info(struct f2fs_sb_info > >>> *sbi, int devi) { > >>> struct block_device *bdev = FDEV(devi).bdev; > >>> sector_t nr_sectors = bdev->bd_part->nr_sects; > >>> + struct f2fs_report_zones_args rep_zone_arg; > >>> int ret; > >>> > >>> if (!f2fs_sb_has_blkzoned(sbi)) > >>> @@ -3078,12 +3093,26 @@ static int init_blkz_info(struct > >>> f2fs_sb_info *sbi, int > >> devi) > >>> if (!FDEV(devi).blkz_seq) > >>> return -ENOMEM; > >>> > >>> - /* Get block zones type */ > >>> + /* Get block zones type and zone-capacity */ > >>> + FDEV(devi).zone_capacity_blocks = f2fs_kzalloc(sbi, > >>> + FDEV(devi).nr_blkz * sizeof(block_t), > >>> + GFP_KERNEL); > >>> + if (!FDEV(devi).zone_capacity_blocks) > >>> + return -ENOMEM; > >>> + > >>> + rep_zone_arg.dev = &FDEV(devi); > >>> + rep_zone_arg.zone_cap_mismatch = false; > >>> + > >>> ret = blkdev_report_zones(bdev, 0, BLK_ALL_ZONES, f2fs_report_zone_cb, > >>> - &FDEV(devi)); > >>> + 
&rep_zone_arg); > >>> if (ret < 0) > >>> return ret; > >> > >> Missed to call kfree(FDEV(devi).zone_capacity_blocks)? > > Thanks for catching it. Will free it here also. > >> > >>> > >>> + if (!rep_zone_arg.zone_cap_mismatch) { > >>> + kvfree(FDEV(devi).zone_capacity_blocks); > >> > >> Ditto, kfree(). > > Ok. > >> > >> Thanks, > >> > >>> + FDEV(devi).zone_capacity_blocks = NULL; > >>> + } > >>> + > >>> return 0; > >>> } > >>> #endif > >>> > > . > >