On 2017/2/28 6:19, Jaegeuk Kim wrote: > On 02/27, Chao Yu wrote: >> On 2017/2/26 2:34, Jaegeuk Kim wrote: >>> On 02/25, Chao Yu wrote: >>>> On 2017/2/24 6:54, Jaegeuk Kim wrote: >>>>> On 02/23, Chao Yu wrote: >>>>>> On 2017/2/14 10:06, Jaegeuk Kim wrote: >>> ... >>>>> >>>>>>> +static int scan_nat_bits(struct f2fs_sb_info *sbi) >>>>>>> +{ >>>>>>> + struct f2fs_nm_info *nm_i = NM_I(sbi); >>>>>>> + struct page *page; >>>>>>> + unsigned int i = 0; >>>>>>> + nid_t target = FREE_NID_PAGES * NAT_ENTRY_PER_BLOCK; >>>>>>> + nid_t nid; >>>>>>> + >>>>>>> + if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG)) >>>>>>> + return -EAGAIN; >>>>>>> + >>>>>>> + down_read(&nm_i->nat_tree_lock); >>>>>>> +check_empty: >>>>>>> + i = find_next_bit_le(nm_i->empty_nat_bits, nm_i->nat_blocks, i); >>>>>>> + if (i >= nm_i->nat_blocks) { >>>>>>> + i = 0; >>>>>>> + goto check_partial; >>>>>>> + } >>>>>>> + >>>>>>> + for (nid = i * NAT_ENTRY_PER_BLOCK; nid < (i + 1) * NAT_ENTRY_PER_BLOCK; >>>>>>> + nid++) { >>>>>>> + if (unlikely(nid >= nm_i->max_nid)) >>>>>>> + break; >>>>>>> + add_free_nid(sbi, nid, true); >>>>>>> + } >>>>>>> + >>>>>>> + if (nm_i->nid_cnt[FREE_NID_LIST] >= target) >>>>>>> + goto out; >>>>>>> + i++; >>>>>>> + goto check_empty; >>>>>>> + >>>>>>> +check_partial: >>>>>>> + i = find_next_zero_bit_le(nm_i->full_nat_bits, nm_i->nat_blocks, i); >>>>>>> + if (i >= nm_i->nat_blocks) { >>>>>>> + disable_nat_bits(sbi, true); >>>>>> >>>>>> Can this happen in real world? Should be a bug in somewhere? >>>>> >>>>> It happens, since current design handles full_nat_bits optionally in order >>>>> to avoid scanning a whole NAT page to set it back as 1 from 0. >>>> >>>> All 0 value in full_nat_bits means the NAT block has at least one free nid, >>>> right? so if we traverse all such NAT blocks, and still haven't collect enough >>>> *target* number of free nids, we don't have to disable nat bit cache, we can >>>> just break out here. >>> >>> No, I'm seeing this case: >>> 1. mount with full=0, empty=0, which indicates some free nids. >>> 2. allocate all there-in free nids, but still full=0, empty=0. >>> 3. allocate more free nids. >>> ... >>> ... checkpoint makes full=1, empty=0 >>> >>> If there is no checkpoint and it consumes all the free nids, we can see all 0 >>> value in full_nat_bits which is quite impossible. In that case, I'd prefer >>> to stop nat_bits and give fsck.f2fs to revive it back. >> >> Yeah, I can understand that, but what I concern is when there is few free nids >> (< target), we still try to load nids of 8 NAT blocks until ending the loop of >> caching free nids, so it will be very easy to enter the case of disabling >> nid_bits cache here, so how about doing more check? >> >> if (i >= nm_i->nat_blocks && > > This indicates full_nat_bits consists of all zeros, which means, whatever in > the next time, we need to return -EINVAL. If we do not disable it here, there is > no way to turn any bit into zero. We can expect late checkpoint will refresh full_nat_bits, __update_nat_bits will turn bit to one or zero there. > > BTW, I'm trying to freeze all the patches for a pull request, so let me have > another patch for any pending issues which are not urgent for now as I think. Got it. I will send you patch if I find explicit bug. :) Thanks, > > Thanks, > >> nm_i->nid_cnt[FREE_NID_LIST] != nm_i->available_nids) >> disable_nat_bits >> >> Thanks, >> >>> >>> Thanks, >>> >>>> >>>> Thanks, >>>> >>>>> >>>>>> >>>>>>> + return -EINVAL; >>>>>>> + } >>>>>>> + >>>>>>> + nid = i * NAT_ENTRY_PER_BLOCK; >>>>>>> + page = get_current_nat_page(sbi, nid); >>>>>>> + scan_nat_page(sbi, page, nid); >>>>>>> + f2fs_put_page(page, 1); >>>>>>> + >>>>>>> + if (nm_i->nid_cnt[FREE_NID_LIST] < target) { >>>>>>> + i++; >>>>>>> + goto check_partial; >>>>>>> + } >>>>>>> +out: >>>>>>> + up_read(&nm_i->nat_tree_lock); >>>>>>> + return 0; >>>>>>> +} >>>>>>> + >>>>>>> +static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount) >>>>>>> { >>>>>>> struct f2fs_nm_info *nm_i = NM_I(sbi); >>>>>>> struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); >>>>>>> @@ -1854,6 +1911,20 @@ static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync) >>>>>>> if (!sync && !available_free_memory(sbi, FREE_NIDS)) >>>>>>> return; >>>>>>> >>>>>>> + /* try to find free nids with nat_bits */ >>>>>>> + if (!mount && !scan_nat_bits(sbi) && nm_i->nid_cnt[FREE_NID_LIST]) >>>>>>> + return; >>>>>>> + >>>>>>> + /* find next valid candidate */ >>>>>> >>>>>> This is just for mount case? >>>>> >>>>> Yup, it reuses free nids in dirty NAT blocks, so that we can make them as full >>>>> NAT pages. >>>>> >>>>> Thanks, >>>>> >>>>>> >>>>>>> + if (is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG)) { >>>>>>> + int idx = find_next_zero_bit_le(nm_i->full_nat_bits, >>>>>>> + nm_i->nat_blocks, 0); >>>>>>> + if (idx >= nm_i->nat_blocks) >>>>>>> + set_sbi_flag(sbi, SBI_NEED_FSCK); >>>>>>> + else >>>>>>> + nid = idx * NAT_ENTRY_PER_BLOCK; >>>>>>> + } >>>>>>> + >>>>>>> /* readahead nat pages to be scanned */ >>>>>>> ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES, >>>>>>> META_NAT, true); >>>>>>> @@ -1896,10 +1967,10 @@ static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync) >>>>>>> nm_i->ra_nid_pages, META_NAT, false); >>>>>>> } >>>>>>> >>>>>>> -void build_free_nids(struct f2fs_sb_info *sbi, bool sync) >>>>>>> +void build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount) >>>>>>> { >>>>>>> mutex_lock(&NM_I(sbi)->build_lock); >>>>>>> - __build_free_nids(sbi, sync); >>>>>>> + __build_free_nids(sbi, sync, mount); >>>>>>> mutex_unlock(&NM_I(sbi)->build_lock); >>>>>>> } >>>>>>> >>>>>>> @@ -1941,7 +2012,7 @@ bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid) >>>>>>> spin_unlock(&nm_i->nid_list_lock); >>>>>>> >>>>>>> /* Let's scan nat pages and its caches to get free nids */ >>>>>>> - build_free_nids(sbi, true); >>>>>>> + build_free_nids(sbi, true, false); >>>>>>> goto retry; >>>>>>> } >>>>>>> >>>>>>> @@ -2233,8 +2304,39 @@ static void __adjust_nat_entry_set(struct nat_entry_set *nes, >>>>>>> list_add_tail(&nes->set_list, head); >>>>>>> } >>>>>>> >>>>>>> +void __update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid, >>>>>>> + struct page *page) >>>>>>> +{ >>>>>>> + struct f2fs_nm_info *nm_i = NM_I(sbi); >>>>>>> + unsigned int nat_index = start_nid / NAT_ENTRY_PER_BLOCK; >>>>>>> + struct f2fs_nat_block *nat_blk = page_address(page); >>>>>>> + int valid = 0; >>>>>>> + int i; >>>>>>> + >>>>>>> + if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG)) >>>>>>> + return; >>>>>>> + >>>>>>> + for (i = 0; i < NAT_ENTRY_PER_BLOCK; i++) { >>>>>>> + if (start_nid == 0 && i == 0) >>>>>>> + valid++; >>>>>>> + if (nat_blk->entries[i].block_addr) >>>>>>> + valid++; >>>>>>> + } >>>>>>> + if (valid == 0) { >>>>>>> + test_and_set_bit_le(nat_index, nm_i->empty_nat_bits); >>>>>>> + test_and_clear_bit_le(nat_index, nm_i->full_nat_bits); >>>>>> >>>>>> set_bit_le/clear_bit_le >>>>>> >>>>>>> + return; >>>>>>> + } >>>>>>> + >>>>>>> + test_and_clear_bit_le(nat_index, nm_i->empty_nat_bits); >>>>>> >>>>>> ditto >>>>>> >>>>>>> + if (valid == NAT_ENTRY_PER_BLOCK) >>>>>>> + test_and_set_bit_le(nat_index, nm_i->full_nat_bits); >>>>>>> + else >>>>>>> + test_and_clear_bit_le(nat_index, nm_i->full_nat_bits); >>>>>> >>>>>> ditto >>>>>> >>>>>>> +} >>>>>>> + >>>>>>> static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, >>>>>>> - struct nat_entry_set *set) >>>>>>> + struct nat_entry_set *set, struct cp_control *cpc) >>>>>>> { >>>>>>> struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); >>>>>>> struct f2fs_journal *journal = curseg->journal; >>>>>>> @@ -2249,7 +2351,8 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, >>>>>>> * #1, flush nat entries to journal in current hot data summary block. >>>>>>> * #2, flush nat entries to nat page. >>>>>>> */ >>>>>>> - if (!__has_cursum_space(journal, set->entry_cnt, NAT_JOURNAL)) >>>>>>> + if (cpc->reason == CP_UMOUNT || >>>>>> >>>>>> if ((cpc->reason == CP_UMOUNT && is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG)) || >>>>>> >>>>>>> + !__has_cursum_space(journal, set->entry_cnt, NAT_JOURNAL)) >>>>>>> to_journal = false; >>>>>>> >>>>>>> if (to_journal) { >>>>>>> @@ -2289,10 +2392,12 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, >>>>>>> } >>>>>>> } >>>>>>> >>>>>>> - if (to_journal) >>>>>>> + if (to_journal) { >>>>>>> up_write(&curseg->journal_rwsem); >>>>>>> - else >>>>>>> + } else { >>>>>>> + __update_nat_bits(sbi, start_nid, page); >>>>>>> f2fs_put_page(page, 1); >>>>>>> + } >>>>>>> >>>>>>> f2fs_bug_on(sbi, set->entry_cnt); >>>>>>> >>>>>>> @@ -2303,7 +2408,7 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, >>>>>>> /* >>>>>>> * This function is called during the checkpointing process. >>>>>>> */ >>>>>>> -void flush_nat_entries(struct f2fs_sb_info *sbi) >>>>>>> +void flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) >>>>>>> { >>>>>>> struct f2fs_nm_info *nm_i = NM_I(sbi); >>>>>>> struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); >>>>>>> @@ -2324,7 +2429,8 @@ void flush_nat_entries(struct f2fs_sb_info *sbi) >>>>>>> * entries, remove all entries from journal and merge them >>>>>>> * into nat entry set. >>>>>>> */ >>>>>>> - if (!__has_cursum_space(journal, nm_i->dirty_nat_cnt, NAT_JOURNAL)) >>>>>>> + if (cpc->reason == CP_UMOUNT || >>>>>> >>>>>> if ((cpc->reason == CP_UMOUNT && is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG)) || >>>>>> >>>>>>> + !__has_cursum_space(journal, nm_i->dirty_nat_cnt, NAT_JOURNAL)) >>>>>>> remove_nats_in_journal(sbi); >>>>>>> >>>>>>> while ((found = __gang_lookup_nat_set(nm_i, >>>>>>> @@ -2338,27 +2444,72 @@ void flush_nat_entries(struct f2fs_sb_info *sbi) >>>>>>> >>>>>>> /* flush dirty nats in nat entry set */ >>>>>>> list_for_each_entry_safe(set, tmp, &sets, set_list) >>>>>>> - __flush_nat_entry_set(sbi, set); >>>>>>> + __flush_nat_entry_set(sbi, set, cpc); >>>>>>> >>>>>>> up_write(&nm_i->nat_tree_lock); >>>>>>> >>>>>>> f2fs_bug_on(sbi, nm_i->dirty_nat_cnt); >>>>>>> } >>>>>>> >>>>>>> +static int __get_nat_bitmaps(struct f2fs_sb_info *sbi) >>>>>>> +{ >>>>>>> + struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); >>>>>>> + struct f2fs_nm_info *nm_i = NM_I(sbi); >>>>>>> + unsigned int nat_bits_bytes = nm_i->nat_blocks / BITS_PER_BYTE; >>>>>>> + unsigned int i; >>>>>>> + __u64 cp_ver = le64_to_cpu(ckpt->checkpoint_ver); >>>>>> >>>>>> __u64 cp_ver = cur_cp_version(ckpt); >>>>>> >>>>>> Thanks, >>>>>> >>>>>>> + size_t crc_offset = le32_to_cpu(ckpt->checksum_offset); >>>>>>> + __u64 crc = le32_to_cpu(*((__le32 *) >>>>>>> + ((unsigned char *)ckpt + crc_offset))); >>>>>>> + block_t nat_bits_addr; >>>>>>> + >>>>>>> + if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG)) >>>>>>> + return 0; >>>>>>> + >>>>>>> + nm_i->nat_bits_blocks = F2FS_BYTES_TO_BLK((nat_bits_bytes << 1) + 8 + >>>>>>> + F2FS_BLKSIZE - 1); >>>>>>> + nm_i->nat_bits = kzalloc(nm_i->nat_bits_blocks << F2FS_BLKSIZE_BITS, >>>>>>> + GFP_KERNEL); >>>>>>> + if (!nm_i->nat_bits) >>>>>>> + return -ENOMEM; >>>>>>> + >>>>>>> + nat_bits_addr = __start_cp_addr(sbi) + sbi->blocks_per_seg - >>>>>>> + nm_i->nat_bits_blocks; >>>>>>> + for (i = 0; i < nm_i->nat_bits_blocks; i++) { >>>>>>> + struct page *page = get_meta_page(sbi, nat_bits_addr++); >>>>>>> + >>>>>>> + memcpy(nm_i->nat_bits + (i << F2FS_BLKSIZE_BITS), >>>>>>> + page_address(page), F2FS_BLKSIZE); >>>>>>> + f2fs_put_page(page, 1); >>>>>>> + } >>>>>>> + >>>>>>> + cp_ver |= (crc << 32); >>>>>>> + if (cpu_to_le64(cp_ver) != *(__le64 *)nm_i->nat_bits) { >>>>>>> + disable_nat_bits(sbi, true); >>>>>>> + return 0; >>>>>>> + } >>>>>>> + >>>>>>> + nm_i->full_nat_bits = nm_i->nat_bits + 8; >>>>>>> + nm_i->empty_nat_bits = nm_i->full_nat_bits + nat_bits_bytes; >>>>>>> + >>>>>>> + f2fs_msg(sbi->sb, KERN_NOTICE, "Found nat_bits in checkpoint"); >>>>>>> + return 0; >>>>>>> +} >>>>>>> + >>>>>>> static int init_node_manager(struct f2fs_sb_info *sbi) >>>>>>> { >>>>>>> struct f2fs_super_block *sb_raw = F2FS_RAW_SUPER(sbi); >>>>>>> struct f2fs_nm_info *nm_i = NM_I(sbi); >>>>>>> unsigned char *version_bitmap; >>>>>>> - unsigned int nat_segs, nat_blocks; >>>>>>> + unsigned int nat_segs; >>>>>>> + int err; >>>>>>> >>>>>>> nm_i->nat_blkaddr = le32_to_cpu(sb_raw->nat_blkaddr); >>>>>>> >>>>>>> /* segment_count_nat includes pair segment so divide to 2. */ >>>>>>> nat_segs = le32_to_cpu(sb_raw->segment_count_nat) >> 1; >>>>>>> - nat_blocks = nat_segs << le32_to_cpu(sb_raw->log_blocks_per_seg); >>>>>>> - >>>>>>> - nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nat_blocks; >>>>>>> + nm_i->nat_blocks = nat_segs << le32_to_cpu(sb_raw->log_blocks_per_seg); >>>>>>> + nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nm_i->nat_blocks; >>>>>>> >>>>>>> /* not used nids: 0, node, meta, (and root counted as valid node) */ >>>>>>> nm_i->available_nids = nm_i->max_nid - sbi->total_valid_node_count - >>>>>>> @@ -2392,6 +2543,10 @@ static int init_node_manager(struct f2fs_sb_info *sbi) >>>>>>> if (!nm_i->nat_bitmap) >>>>>>> return -ENOMEM; >>>>>>> >>>>>>> + err = __get_nat_bitmaps(sbi); >>>>>>> + if (err) >>>>>>> + return err; >>>>>>> + >>>>>>> #ifdef CONFIG_F2FS_CHECK_FS >>>>>>> nm_i->nat_bitmap_mir = kmemdup(version_bitmap, nm_i->bitmap_size, >>>>>>> GFP_KERNEL); >>>>>>> @@ -2414,7 +2569,7 @@ int build_node_manager(struct f2fs_sb_info *sbi) >>>>>>> if (err) >>>>>>> return err; >>>>>>> >>>>>>> - build_free_nids(sbi, true); >>>>>>> + build_free_nids(sbi, true, true); >>>>>>> return 0; >>>>>>> } >>>>>>> >>>>>>> @@ -2473,6 +2628,7 @@ void destroy_node_manager(struct f2fs_sb_info *sbi) >>>>>>> up_write(&nm_i->nat_tree_lock); >>>>>>> >>>>>>> kfree(nm_i->nat_bitmap); >>>>>>> + kfree(nm_i->nat_bits); >>>>>>> #ifdef CONFIG_F2FS_CHECK_FS >>>>>>> kfree(nm_i->nat_bitmap_mir); >>>>>>> #endif >>>>>>> diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c >>>>>>> index df2ff5cfe8f4..8e1ec248c653 100644 >>>>>>> --- a/fs/f2fs/segment.c >>>>>>> +++ b/fs/f2fs/segment.c >>>>>>> @@ -386,7 +386,7 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) >>>>>>> if (!available_free_memory(sbi, FREE_NIDS)) >>>>>>> try_to_free_nids(sbi, MAX_FREE_NIDS); >>>>>>> else >>>>>>> - build_free_nids(sbi, false); >>>>>>> + build_free_nids(sbi, false, false); >>>>>>> >>>>>>> if (!is_idle(sbi)) >>>>>>> return; >>>>>>> diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h >>>>>>> index f0748524ca8c..1c92ace2e8f8 100644 >>>>>>> --- a/include/linux/f2fs_fs.h >>>>>>> +++ b/include/linux/f2fs_fs.h >>>>>>> @@ -114,6 +114,7 @@ struct f2fs_super_block { >>>>>>> /* >>>>>>> * For checkpoint >>>>>>> */ >>>>>>> +#define CP_NAT_BITS_FLAG 0x00000080 >>>>>>> #define CP_CRC_RECOVERY_FLAG 0x00000040 >>>>>>> #define CP_FASTBOOT_FLAG 0x00000020 >>>>>>> #define CP_FSCK_FLAG 0x00000010 >>>>>>> >>>>>> >>>>>> >>>>>> ------------------------------------------------------------------------------ >>>>>> Check out the vibrant tech community on one of the world's most >>>>>> engaging tech sites, SlashDot.org! http://sdm.link/slashdot >>>>>> _______________________________________________ >>>>>> Linux-f2fs-devel mailing list >>>>>> Linux-f2fs-devel@xxxxxxxxxxxxxxxxxxxxx >>>>>> https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel >>>>> >>>>> . >>>>> >>>> >>>> >>>> ------------------------------------------------------------------------------ >>>> Check out the vibrant tech community on one of the world's most >>>> engaging tech sites, SlashDot.org! http://sdm.link/slashdot >>>> _______________________________________________ >>>> Linux-f2fs-devel mailing list >>>> Linux-f2fs-devel@xxxxxxxxxxxxxxxxxxxxx >>>> https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel >>> >>> . >>> >> >> >> ------------------------------------------------------------------------------ >> Check out the vibrant tech community on one of the world's most >> engaging tech sites, SlashDot.org! http://sdm.link/slashdot >> _______________________________________________ >> Linux-f2fs-devel mailing list >> Linux-f2fs-devel@xxxxxxxxxxxxxxxxxxxxx >> https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel > > . >